author     Dimitry Andric <dim@FreeBSD.org>  2017-05-16 19:46:52 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2017-05-16 19:46:52 +0000
commit     6b3f41ed88e8e440e11a4fbf20b6600529f80049 (patch)
tree       928b056f24a634d628c80238dbbf10d41b1a71d5
parent     c46e6a5940c50058e00c0c5f9123fd82e338d29a (diff)
download   src-6b3f41ed88e8e440e11a4fbf20b6600529f80049.tar.gz
           src-6b3f41ed88e8e440e11a4fbf20b6600529f80049.zip

Vendor import of llvm trunk r303197 (tag: vendor/llvm/llvm-trunk-r303197)

Notes:
    svn path=/vendor/llvm/dist/; revision=318368
    svn path=/vendor/llvm/llvm-trunk-r303197/; revision=318369; tag=vendor/llvm/llvm-trunk-r303197
-rw-r--r--CREDITS.TXT2
-rwxr-xr-xcmake/config-ix.cmake10
-rw-r--r--cmake/modules/AddSphinxTarget.cmake13
-rw-r--r--docs/CMakeLists.txt2
-rw-r--r--docs/GettingStarted.rst4
-rw-r--r--docs/LangRef.rst348
-rw-r--r--docs/Lexicon.rst8
-rw-r--r--docs/LibFuzzer.rst19
-rw-r--r--docs/ReleaseNotes.rst4
-rw-r--r--include/llvm/ADT/APInt.h49
-rw-r--r--include/llvm/ADT/BitVector.h2
-rw-r--r--include/llvm/ADT/STLExtras.h14
-rw-r--r--include/llvm/ADT/StringExtras.h7
-rw-r--r--include/llvm/Analysis/CallGraph.h10
-rw-r--r--include/llvm/Analysis/ProfileSummaryInfo.h4
-rw-r--r--include/llvm/Analysis/ScalarEvolution.h21
-rw-r--r--include/llvm/Analysis/TargetLibraryInfo.def127
-rw-r--r--include/llvm/Analysis/TargetTransformInfo.h33
-rw-r--r--include/llvm/Analysis/TargetTransformInfoImpl.h12
-rw-r--r--include/llvm/Analysis/ValueTracking.h13
-rw-r--r--include/llvm/Bitcode/BitcodeReader.h5
-rw-r--r--include/llvm/CodeGen/ExpandReductions.h24
-rw-r--r--include/llvm/CodeGen/GlobalISel/LegalizerInfo.h17
-rw-r--r--include/llvm/CodeGen/GlobalISel/Utils.h3
-rw-r--r--include/llvm/CodeGen/ISDOpcodes.h21
-rw-r--r--include/llvm/CodeGen/MachineCombinerPattern.h2
-rw-r--r--include/llvm/CodeGen/Passes.h12
-rw-r--r--include/llvm/CodeGen/SelectionDAG.h14
-rw-r--r--include/llvm/DebugInfo/CodeView/CVTypeVisitor.h4
-rw-r--r--include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h103
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeDatabase.h23
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h13
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeDeserializer.h4
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h1
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeIndex.h62
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h8
-rw-r--r--include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h9
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFContext.h7
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFDebugLine.h2
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h7
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFRelocMap.h12
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFVerifier.h6
-rw-r--r--include/llvm/DebugInfo/PDB/Native/RawTypes.h7
-rw-r--r--include/llvm/DebugInfo/PDB/Native/TpiStream.h4
-rw-r--r--include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h2
-rw-r--r--include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h13
-rw-r--r--include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h29
-rw-r--r--include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h4
-rw-r--r--include/llvm/ExecutionEngine/RTDyldMemoryManager.h16
-rw-r--r--include/llvm/ExecutionEngine/RuntimeDyld.h3
-rw-r--r--include/llvm/IR/Attributes.h41
-rw-r--r--include/llvm/IR/CallingConv.h4
-rw-r--r--include/llvm/IR/Constants.h10
-rw-r--r--include/llvm/IR/DebugInfoMetadata.h37
-rw-r--r--include/llvm/IR/DebugLoc.h16
-rw-r--r--include/llvm/IR/DerivedTypes.h42
-rw-r--r--include/llvm/IR/DiagnosticInfo.h23
-rw-r--r--include/llvm/IR/Function.h38
-rw-r--r--include/llvm/IR/GetElementPtrTypeIterator.h8
-rw-r--r--include/llvm/IR/GlobalAlias.h8
-rw-r--r--include/llvm/IR/GlobalIFunc.h8
-rw-r--r--include/llvm/IR/GlobalObject.h4
-rw-r--r--include/llvm/IR/GlobalValue.h20
-rw-r--r--include/llvm/IR/GlobalVariable.h63
-rw-r--r--include/llvm/IR/IRBuilder.h39
-rw-r--r--include/llvm/IR/InstrTypes.h18
-rw-r--r--include/llvm/IR/Instruction.h6
-rw-r--r--include/llvm/IR/Instructions.h149
-rw-r--r--include/llvm/IR/Intrinsics.td44
-rw-r--r--include/llvm/IR/LLVMContext.h12
-rw-r--r--include/llvm/IR/LegacyPassManager.h3
-rw-r--r--include/llvm/IR/Module.h100
-rw-r--r--include/llvm/IR/ModuleSummaryIndex.h59
-rw-r--r--include/llvm/IR/PassManager.h122
-rw-r--r--include/llvm/IR/PassManagerInternal.h11
-rw-r--r--include/llvm/IR/PatternMatch.h39
-rw-r--r--include/llvm/IR/ProfileSummary.h20
-rw-r--r--include/llvm/IR/Statepoint.h16
-rw-r--r--include/llvm/IR/SymbolTableListTraits.h10
-rw-r--r--include/llvm/IR/TrackingMDRef.h16
-rw-r--r--include/llvm/IR/Type.h45
-rw-r--r--include/llvm/IR/TypeFinder.h4
-rw-r--r--include/llvm/IR/Use.h28
-rw-r--r--include/llvm/IR/UseListOrder.h2
-rw-r--r--include/llvm/IR/User.h19
-rw-r--r--include/llvm/IR/Value.h23
-rw-r--r--include/llvm/IR/ValueHandle.h48
-rw-r--r--include/llvm/IR/ValueMap.h42
-rw-r--r--include/llvm/IR/ValueSymbolTable.h6
-rw-r--r--include/llvm/IR/Verifier.h15
-rw-r--r--include/llvm/InitializePasses.h5
-rw-r--r--include/llvm/LinkAllPasses.h1
-rw-r--r--include/llvm/Object/Wasm.h7
-rw-r--r--include/llvm/ObjectYAML/WasmYAML.h23
-rw-r--r--include/llvm/ProfileData/SampleProfWriter.h11
-rw-r--r--include/llvm/Support/BinaryStreamArray.h5
-rw-r--r--include/llvm/Support/Compiler.h8
-rw-r--r--include/llvm/Support/KnownBits.h60
-rw-r--r--include/llvm/Support/Parallel.h249
-rw-r--r--include/llvm/Support/Wasm.h23
-rw-r--r--include/llvm/Target/Target.td5
-rw-r--r--include/llvm/Target/TargetInstrInfo.h13
-rw-r--r--include/llvm/Target/TargetLowering.h24
-rw-r--r--include/llvm/Target/TargetSchedule.td2
-rw-r--r--include/llvm/Target/TargetSelectionDAG.td2
-rw-r--r--include/llvm/ToolDrivers/llvm-lib/LibDriver.h (renamed from include/llvm/LibDriver/LibDriver.h)6
-rw-r--r--include/llvm/Transforms/Utils/Cloning.h7
-rw-r--r--include/llvm/Transforms/Utils/LoopUtils.h32
-rw-r--r--include/llvm/Transforms/Vectorize/SLPVectorizer.h4
-rw-r--r--include/llvm/module.modulemap1
-rw-r--r--lib/Analysis/BasicAliasAnalysis.cpp9
-rw-r--r--lib/Analysis/BranchProbabilityInfo.cpp11
-rw-r--r--lib/Analysis/CallGraph.cpp34
-rw-r--r--lib/Analysis/ConstantFolding.cpp80
-rw-r--r--lib/Analysis/DemandedBits.cpp10
-rw-r--r--lib/Analysis/InlineCost.cpp2
-rw-r--r--lib/Analysis/InstructionSimplify.cpp167
-rw-r--r--lib/Analysis/ModuleSummaryAnalysis.cpp3
-rw-r--r--lib/Analysis/OptimizationDiagnosticInfo.cpp2
-rw-r--r--lib/Analysis/ProfileSummaryInfo.cpp13
-rw-r--r--lib/Analysis/ScalarEvolution.cpp159
-rw-r--r--lib/Analysis/TargetLibraryInfo.cpp112
-rw-r--r--lib/Analysis/TargetTransformInfo.cpp13
-rw-r--r--lib/Analysis/ValueTracking.cpp276
-rw-r--r--lib/Analysis/VectorUtils.cpp1
-rw-r--r--lib/AsmParser/LLParser.cpp18
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.cpp17
-rw-r--r--lib/Bitcode/Reader/MetadataLoader.cpp2
-rw-r--r--lib/Bitcode/Writer/BitcodeWriter.cpp8
-rw-r--r--lib/Bitcode/Writer/ValueEnumerator.cpp7
-rw-r--r--lib/CMakeLists.txt2
-rw-r--r--lib/CodeGen/AsmPrinter/CodeViewDebug.cpp14
-rw-r--r--lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp3
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp42
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCompileUnit.h22
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfDebug.cpp89
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfDebug.h20
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfFile.h4
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfUnit.cpp6
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfUnit.h5
-rw-r--r--lib/CodeGen/AsmPrinter/WinException.cpp12
-rw-r--r--lib/CodeGen/AtomicExpandPass.cpp31
-rw-r--r--lib/CodeGen/CMakeLists.txt3
-rw-r--r--lib/CodeGen/CodeGen.cpp4
-rw-r--r--lib/CodeGen/CodeGenPrepare.cpp548
-rw-r--r--lib/CodeGen/ExpandPostRAPseudos.cpp5
-rw-r--r--lib/CodeGen/ExpandReductions.cpp167
-rw-r--r--lib/CodeGen/GlobalISel/LegalizerInfo.cpp10
-rw-r--r--lib/CodeGen/GlobalISel/RegBankSelect.cpp9
-rw-r--r--lib/CodeGen/GlobalISel/Utils.cpp8
-rw-r--r--lib/CodeGen/IfConversion.cpp30
-rw-r--r--lib/CodeGen/LiveRangeShrink.cpp211
-rw-r--r--lib/CodeGen/LiveVariables.cpp2
-rw-r--r--lib/CodeGen/MachineBlockPlacement.cpp30
-rw-r--r--lib/CodeGen/MachineVerifier.cpp4
-rw-r--r--lib/CodeGen/PHIElimination.cpp2
-rw-r--r--lib/CodeGen/RegisterCoalescer.cpp2
-rw-r--r--lib/CodeGen/RegisterScavenging.cpp7
-rw-r--r--lib/CodeGen/SafeStack.cpp172
-rw-r--r--lib/CodeGen/ScalarizeMaskedMemIntrin.cpp660
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp176
-rw-r--r--lib/CodeGen/SelectionDAG/FastISel.cpp20
-rw-r--r--lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp33
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeDAG.cpp3
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.h1
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp58
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAG.cpp162
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp138
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h6
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp13
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp74
-rw-r--r--lib/CodeGen/SelectionDAG/TargetLowering.cpp11
-rw-r--r--lib/CodeGen/ShrinkWrap.cpp12
-rw-r--r--lib/CodeGen/SjLjEHPrepare.cpp4
-rw-r--r--lib/CodeGen/TargetLoweringObjectFileImpl.cpp6
-rw-r--r--lib/CodeGen/TargetPassConfig.cpp11
-rw-r--r--lib/CodeGen/TwoAddressInstructionPass.cpp7
-rw-r--r--lib/CodeGen/UnreachableBlockElim.cpp7
-rw-r--r--lib/DebugInfo/CodeView/CMakeLists.txt2
-rw-r--r--lib/DebugInfo/CodeView/CVTypeVisitor.cpp41
-rw-r--r--lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp10
-rw-r--r--lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp91
-rw-r--r--lib/DebugInfo/CodeView/TypeDatabase.cpp73
-rw-r--r--lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp65
-rw-r--r--lib/DebugInfo/CodeView/TypeDumpVisitor.cpp9
-rw-r--r--lib/DebugInfo/DWARF/DWARFContext.cpp25
-rw-r--r--lib/DebugInfo/DWARF/DWARFDebugAranges.cpp5
-rw-r--r--lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp4
-rw-r--r--lib/DebugInfo/DWARF/DWARFDie.cpp12
-rw-r--r--lib/DebugInfo/DWARF/DWARFTypeUnit.cpp6
-rw-r--r--lib/DebugInfo/DWARF/DWARFUnit.cpp16
-rw-r--r--lib/DebugInfo/DWARF/DWARFVerifier.cpp8
-rw-r--r--lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp2
-rw-r--r--lib/ExecutionEngine/Orc/OrcMCJITReplacement.h5
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp12
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp4
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp33
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h2
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h2
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h1
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h1
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h3
-rw-r--r--lib/Fuzzer/FuzzerDriver.cpp3
-rw-r--r--lib/Fuzzer/FuzzerFlags.def8
-rw-r--r--lib/Fuzzer/FuzzerInternal.h1
-rw-r--r--lib/Fuzzer/FuzzerLoop.cpp20
-rw-r--r--lib/Fuzzer/FuzzerMutate.cpp5
-rw-r--r--lib/Fuzzer/afl/afl_driver.cpp57
-rw-r--r--lib/Fuzzer/test/AFLDriverTest.cpp8
-rw-r--r--lib/Fuzzer/test/CMakeLists.txt1
-rw-r--r--lib/Fuzzer/test/OverwriteInputTest.cpp13
-rw-r--r--lib/Fuzzer/test/afl-driver.test26
-rw-r--r--lib/Fuzzer/test/overwrite-input.test2
-rw-r--r--lib/IR/AsmWriter.cpp7
-rw-r--r--lib/IR/AttributeImpl.h18
-rw-r--r--lib/IR/Attributes.cpp72
-rw-r--r--lib/IR/ConstantFold.cpp10
-rw-r--r--lib/IR/ConstantRange.cpp37
-rw-r--r--lib/IR/Constants.cpp15
-rw-r--r--lib/IR/ConstantsContext.h49
-rw-r--r--lib/IR/DebugInfoMetadata.cpp18
-rw-r--r--lib/IR/DebugLoc.cpp114
-rw-r--r--lib/IR/DiagnosticInfo.cpp25
-rw-r--r--lib/IR/Function.cpp79
-rw-r--r--lib/IR/Globals.cpp43
-rw-r--r--lib/IR/IRBuilder.cpp88
-rw-r--r--lib/IR/Instruction.cpp24
-rw-r--r--lib/IR/Instructions.cpp119
-rw-r--r--lib/IR/LegacyPassManager.cpp13
-rw-r--r--lib/IR/Module.cpp35
-rw-r--r--lib/IR/Type.cpp71
-rw-r--r--lib/IR/Verifier.cpp13
-rw-r--r--lib/LLVMBuild.txt2
-rw-r--r--lib/LTO/LTO.cpp4
-rw-r--r--lib/LTO/LTOCodeGenerator.cpp18
-rw-r--r--lib/LTO/ThinLTOCodeGenerator.cpp3
-rw-r--r--lib/Linker/IRMover.cpp18
-rw-r--r--lib/MC/MCObjectStreamer.cpp5
-rw-r--r--lib/MC/MCParser/AsmParser.cpp21
-rw-r--r--lib/Object/COFFObjectFile.cpp4
-rw-r--r--lib/Object/WasmObjectFile.cpp41
-rw-r--r--lib/ObjectYAML/WasmYAML.cpp8
-rw-r--r--lib/ProfileData/SampleProfWriter.cpp42
-rw-r--r--lib/Support/APInt.cpp312
-rw-r--r--lib/Support/CMakeLists.txt1
-rw-r--r--lib/Support/Parallel.cpp138
-rw-r--r--lib/Support/Unix/Path.inc30
-rw-r--r--lib/Support/Unix/Process.inc4
-rw-r--r--lib/Target/AArch64/AArch64.td1
-rw-r--r--lib/Target/AArch64/AArch64CallLowering.cpp2
-rw-r--r--lib/Target/AArch64/AArch64FastISel.cpp2
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp6
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.cpp30
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td8
-rw-r--r--lib/Target/AArch64/AArch64RegisterBankInfo.cpp27
-rw-r--r--lib/Target/AArch64/AArch64SchedFalkorDetails.td78
-rw-r--r--lib/Target/AArch64/AArch64SchedFalkorWriteRes.td37
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp8
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.h10
-rw-r--r--lib/Target/AArch64/AArch64TargetObjectFile.cpp8
-rw-r--r--lib/Target/AArch64/AArch64TargetObjectFile.h3
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.cpp106
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.h11
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp19
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp10
-rw-r--r--lib/Target/AMDGPU/AMDGPU.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPU.td21
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp8
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp10
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h2
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp3
-rw-r--r--lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp1
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp2881
-rw-r--r--lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp7
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.cpp3
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h15
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp18
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp34
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h3
-rw-r--r--lib/Target/AMDGPU/CMakeLists.txt1
-rw-r--r--lib/Target/AMDGPU/FLATInstructions.td34
-rw-r--r--lib/Target/AMDGPU/GCNRegPressure.cpp153
-rw-r--r--lib/Target/AMDGPU/GCNRegPressure.h61
-rw-r--r--lib/Target/AMDGPU/GCNSchedStrategy.cpp282
-rw-r--r--lib/Target/AMDGPU/GCNSchedStrategy.h24
-rw-r--r--lib/Target/AMDGPU/SIAnnotateControlFlow.cpp2
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp22
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h1
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.cpp310
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.h33
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td7
-rw-r--r--lib/Target/AMDGPU/VOP2Instructions.td51
-rw-r--r--lib/Target/AMDGPU/VOP3Instructions.td3
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h18
-rw-r--r--lib/Target/ARM/ARMCallLowering.cpp2
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp2
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp27
-rw-r--r--lib/Target/ARM/ARMISelLowering.h8
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td7
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td4
-rw-r--r--lib/Target/ARM/ARMInstructionSelector.cpp39
-rw-r--r--lib/Target/ARM/ARMLegalizerInfo.cpp8
-rw-r--r--lib/Target/ARM/ARMOptimizeBarriersPass.cpp4
-rw-r--r--lib/Target/ARM/ARMRegisterBankInfo.cpp1
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp2
-rw-r--r--lib/Target/AVR/AVRFrameLowering.cpp2
-rw-r--r--lib/Target/AVR/AVRISelLowering.cpp10
-rw-r--r--lib/Target/AVR/AVRInstrInfo.td6
-rw-r--r--lib/Target/AVR/AVRRegisterInfo.cpp1
-rw-r--r--lib/Target/BPF/BPFISelLowering.cpp3
-rw-r--r--lib/Target/BPF/BPFInstrInfo.td9
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp3
-rw-r--r--lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonPatterns.td7
-rw-r--r--lib/Target/Hexagon/HexagonPseudo.td2
-rw-r--r--lib/Target/Lanai/LanaiISelLowering.cpp31
-rw-r--r--lib/Target/Lanai/LanaiISelLowering.h5
-rw-r--r--lib/Target/Lanai/LanaiInstrInfo.td12
-rw-r--r--lib/Target/MSP430/MSP430FrameLowering.cpp7
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp271
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.h6
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.td9
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.cpp4
-rw-r--r--lib/Target/Mips/MipsFastISel.cpp2
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp2
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td6
-rw-r--r--lib/Target/Mips/MipsOptimizePICCall.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp18
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.td9
-rw-r--r--lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp17
-rw-r--r--lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp3
-rw-r--r--lib/Target/PowerPC/PPCFastISel.cpp3
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp255
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp98
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h29
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td28
-rw-r--r--lib/Target/PowerPC/PPCInstrAltivec.td40
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td13
-rw-r--r--lib/Target/PowerPC/PPCInstrVSX.td2
-rw-r--r--lib/Target/PowerPC/PPCTLSDynamicCall.cpp3
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp31
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td9
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.td6
-rw-r--r--lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp4
-rw-r--r--lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp19
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp14
-rw-r--r--lib/Target/SystemZ/README.txt2
-rw-r--r--lib/Target/SystemZ/SystemZFeatures.td14
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.cpp13
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.h2
-rw-r--r--lib/Target/SystemZ/SystemZInstrFP.td13
-rw-r--r--lib/Target/SystemZ/SystemZInstrFormats.td301
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.td201
-rw-r--r--lib/Target/SystemZ/SystemZOperands.td2
-rw-r--r--lib/Target/SystemZ/SystemZOperators.td3
-rw-r--r--lib/Target/SystemZ/SystemZSchedule.td4
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZ13.td84
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZ196.td92
-rw-r--r--lib/Target/SystemZ/SystemZScheduleZEC12.td92
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.cpp7
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.h10
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrCall.td4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInfo.td3
-rw-r--r--lib/Target/X86/X86.td3
-rw-r--r--lib/Target/X86/X86FastISel.cpp48
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp269
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp41
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp214
-rw-r--r--lib/Target/X86/X86InstrCompiler.td14
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp52
-rw-r--r--lib/Target/X86/X86InstrInfo.h11
-rw-r--r--lib/Target/X86/X86InstrInfo.td35
-rw-r--r--lib/Target/X86/X86InstrSSE.td18
-rw-r--r--lib/Target/X86/X86InstructionSelector.cpp214
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h2
-rw-r--r--lib/Target/X86/X86LegalizerInfo.cpp16
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp14
-rw-r--r--lib/Target/X86/X86Subtarget.h6
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp4
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp194
-rw-r--r--lib/Target/X86/X86WinEHState.cpp2
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp5
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.td11
-rw-r--r--lib/ToolDrivers/CMakeLists.txt1
-rw-r--r--lib/ToolDrivers/LLVMBuild.txt24
-rw-r--r--lib/ToolDrivers/llvm-lib/CMakeLists.txt (renamed from lib/LibDriver/CMakeLists.txt)0
-rw-r--r--lib/ToolDrivers/llvm-lib/LLVMBuild.txt (renamed from lib/LibDriver/LLVMBuild.txt)0
-rw-r--r--lib/ToolDrivers/llvm-lib/LibDriver.cpp (renamed from lib/LibDriver/LibDriver.cpp)2
-rw-r--r--lib/ToolDrivers/llvm-lib/Options.td (renamed from lib/LibDriver/Options.td)0
-rw-r--r--lib/Transforms/Coroutines/CoroFrame.cpp100
-rw-r--r--lib/Transforms/IPO/FunctionImport.cpp15
-rw-r--r--lib/Transforms/IPO/Inliner.cpp4
-rw-r--r--lib/Transforms/IPO/PartialInlining.cpp426
-rw-r--r--lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp6
-rw-r--r--lib/Transforms/InstCombine/InstCombineAddSub.cpp199
-rw-r--r--lib/Transforms/InstCombine/InstCombineAndOrXor.cpp109
-rw-r--r--lib/Transforms/InstCombine/InstCombineCalls.cpp8
-rw-r--r--lib/Transforms/InstCombine/InstCombineCasts.cpp25
-rw-r--r--lib/Transforms/InstCombine/InstCombineCompares.cpp39
-rw-r--r--lib/Transforms/InstCombine/InstCombineInternal.h30
-rw-r--r--lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp6
-rw-r--r--lib/Transforms/InstCombine/InstCombineMulDivRem.cpp20
-rw-r--r--lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp2
-rw-r--r--lib/Transforms/InstCombine/InstructionCombining.cpp26
-rw-r--r--lib/Transforms/Instrumentation/AddressSanitizer.cpp34
-rw-r--r--lib/Transforms/Instrumentation/DataFlowSanitizer.cpp8
-rw-r--r--lib/Transforms/Instrumentation/EfficiencySanitizer.cpp49
-rw-r--r--lib/Transforms/Instrumentation/MemorySanitizer.cpp7
-rw-r--r--lib/Transforms/Scalar/CorrelatedValuePropagation.cpp10
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp295
-rw-r--r--lib/Transforms/Scalar/NewGVN.cpp210
-rw-r--r--lib/Transforms/Scalar/SimpleLoopUnswitch.cpp161
-rw-r--r--lib/Transforms/Scalar/SpeculativeExecution.cpp43
-rw-r--r--lib/Transforms/Utils/BypassSlowDivision.cpp4
-rw-r--r--lib/Transforms/Utils/CloneFunction.cpp32
-rw-r--r--lib/Transforms/Utils/CloneModule.cpp2
-rw-r--r--lib/Transforms/Utils/EscapeEnumerator.cpp3
-rw-r--r--lib/Transforms/Utils/InlineFunction.cpp61
-rw-r--r--lib/Transforms/Utils/InstructionNamer.cpp13
-rw-r--r--lib/Transforms/Utils/Local.cpp106
-rw-r--r--lib/Transforms/Utils/LoopUtils.cpp201
-rw-r--r--lib/Transforms/Utils/ModuleUtils.cpp12
-rw-r--r--lib/Transforms/Utils/SimplifyLibCalls.cpp6
-rw-r--r--lib/Transforms/Utils/VNCoercion.cpp9
-rw-r--r--lib/Transforms/Utils/ValueMapper.cpp9
-rw-r--r--lib/Transforms/Vectorize/LoadStoreVectorizer.cpp2
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp241
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp112
-rw-r--r--lib/XRay/Trace.cpp34
-rw-r--r--projects/CMakeLists.txt4
-rw-r--r--test/Analysis/BasicAA/cs-cs-arm.ll34
-rw-r--r--test/Analysis/BasicAA/cs-cs.ll37
-rw-r--r--test/Analysis/BasicAA/intrinsics-arm.ll31
-rw-r--r--test/Analysis/BasicAA/intrinsics.ll34
-rw-r--r--test/Analysis/BranchProbabilityInfo/basic.ll6
-rw-r--r--test/Analysis/CostModel/AArch64/free-widening-casts.ll622
-rw-r--r--test/Analysis/CostModel/AMDGPU/extractelement.ll74
-rw-r--r--test/Analysis/CostModel/AMDGPU/insertelement.ll43
-rw-r--r--test/Analysis/CostModel/AMDGPU/shufflevector.ll43
-rw-r--r--test/Analysis/CostModel/X86/div.ll32
-rw-r--r--test/Analysis/CostModel/X86/vshift-ashr-cost.ll138
-rw-r--r--test/Analysis/CostModel/X86/vshift-lshr-cost.ll128
-rw-r--r--test/Analysis/CostModel/X86/vshift-shl-cost.ll134
-rw-r--r--test/Analysis/ScalarEvolution/different-loops-recs.ll454
-rw-r--r--test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll18
-rw-r--r--test/Assembler/globalvariable-attributes.ll19
-rw-r--r--test/Bitcode/globalvariable-attributes.ll19
-rw-r--r--test/Bitcode/ptest-old.ll1
-rw-r--r--test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll2
-rw-r--r--test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll121
-rw-r--r--test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir96
-rw-r--r--test/CodeGen/AArch64/GlobalISel/call-translator.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-ccmp.ll2
-rw-r--r--test/CodeGen/AArch64/arm64-fml-combines.ll24
-rw-r--r--test/CodeGen/AArch64/arm64-hello.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-misched-multimmo.ll2
-rw-r--r--test/CodeGen/AArch64/macho-global-symbols.ll17
-rw-r--r--test/CodeGen/AArch64/misched-fusion-aes.ll33
-rw-r--r--test/CodeGen/AArch64/stackmap-frame-setup.ll4
-rw-r--r--test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir2
-rw-r--r--test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir2
-rw-r--r--test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir20
-rw-r--r--test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg2
-rw-r--r--test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir70
-rw-r--r--test/CodeGen/AMDGPU/constant-fold-mi-operands.ll12
-rw-r--r--test/CodeGen/AMDGPU/ctpop.ll80
-rw-r--r--test/CodeGen/AMDGPU/ctpop64.ll16
-rw-r--r--test/CodeGen/AMDGPU/fneg-combines.ll9
-rw-r--r--test/CodeGen/AMDGPU/fneg.f16.ll39
-rw-r--r--test/CodeGen/AMDGPU/inserted-wait-states.mir10
-rw-r--r--test/CodeGen/AMDGPU/limit-coalesce.mir6
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll18
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll4
-rw-r--r--test/CodeGen/AMDGPU/madak.ll6
-rw-r--r--test/CodeGen/AMDGPU/promote-alloca-volatile.ll12
-rw-r--r--test/CodeGen/AMDGPU/v_madak_f16.ll2
-rw-r--r--test/CodeGen/AMDGPU/waitcnt.mir22
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir200
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll16
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-legalizer.mir30
-rw-r--r--test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir136
-rw-r--r--test/CodeGen/ARM/divmod-eabi.ll73
-rw-r--r--test/CodeGen/ARM/divmod.ll1
-rw-r--r--test/CodeGen/AVR/select-mbb-placement-bug.ll35
-rw-r--r--test/CodeGen/Generic/expand-experimental-reductions.ll210
-rw-r--r--test/CodeGen/Hexagon/regalloc-bad-undef.mir8
-rw-r--r--test/CodeGen/Lanai/masking_setccs.ll48
-rw-r--r--test/CodeGen/Lanai/peephole-compare.mir4
-rw-r--r--test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir24
-rw-r--r--test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir64
-rw-r--r--test/CodeGen/MIR/X86/frame-info-save-restore-points.mir2
-rw-r--r--test/CodeGen/MSP430/hwmult16.ll43
-rw-r--r--test/CodeGen/MSP430/hwmult32.ll43
-rw-r--r--test/CodeGen/MSP430/hwmultf5.ll43
-rw-r--r--test/CodeGen/MSP430/jumptable.ll2
-rw-r--r--test/CodeGen/MSP430/libcalls.ll595
-rw-r--r--test/CodeGen/MSP430/promote-i8-mul.ll (renamed from test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll)2
-rw-r--r--test/CodeGen/NVPTX/bug17709.ll52
-rw-r--r--test/CodeGen/NVPTX/ctlz.ll2
-rw-r--r--test/CodeGen/NVPTX/ctpop.ll2
-rw-r--r--test/CodeGen/NVPTX/cttz.ll3
-rw-r--r--test/CodeGen/NVPTX/f16-instructions.ll2157
-rw-r--r--test/CodeGen/NVPTX/f16x2-instructions.ll2853
-rw-r--r--test/CodeGen/NVPTX/fma.ll84
-rw-r--r--test/CodeGen/NVPTX/i8-param.ll46
-rw-r--r--test/CodeGen/NVPTX/param-load-store.ll1878
-rw-r--r--test/CodeGen/NVPTX/sched1.ll4
-rw-r--r--test/CodeGen/NVPTX/sched2.ll4
-rw-r--r--test/CodeGen/NVPTX/simple-call.ll52
-rw-r--r--test/CodeGen/NVPTX/vec8.ll2
-rw-r--r--test/CodeGen/NVPTX/vector-call.ll60
-rw-r--r--test/CodeGen/NVPTX/zeroext-32bit.ll52
-rw-r--r--test/CodeGen/PowerPC/mtvsrdd.ll22
-rw-r--r--test/CodeGen/PowerPC/setcc-logic.ll12
-rw-r--r--test/CodeGen/PowerPC/stackmap-frame-setup.ll4
-rw-r--r--test/CodeGen/PowerPC/tail-dup-layout.ll97
-rw-r--r--test/CodeGen/PowerPC/testComparesieqsc.ll138
-rw-r--r--test/CodeGen/PowerPC/testComparesieqsi.ll138
-rw-r--r--test/CodeGen/PowerPC/testComparesieqss.ll138
-rw-r--r--test/CodeGen/PowerPC/testComparesiequc.ll138
-rw-r--r--test/CodeGen/PowerPC/testComparesiequi.ll138
-rw-r--r--test/CodeGen/PowerPC/testComparesiequs.ll138
-rw-r--r--test/CodeGen/PowerPC/testCompareslleqsc.ll138
-rw-r--r--test/CodeGen/PowerPC/testCompareslleqsi.ll138
-rw-r--r--test/CodeGen/PowerPC/testCompareslleqss.ll137
-rw-r--r--test/CodeGen/PowerPC/testComparesllequc.ll137
-rw-r--r--test/CodeGen/PowerPC/testComparesllequi.ll137
-rw-r--r--test/CodeGen/PowerPC/testComparesllequs.ll137
-rw-r--r--test/CodeGen/SPARC/LeonItinerariesUT.ll4
-rw-r--r--test/CodeGen/SPARC/inlineasm-v9.ll30
-rw-r--r--test/CodeGen/SPARC/inlineasm.ll18
-rw-r--r--test/CodeGen/SystemZ/list-ilp-crash.ll23
-rw-r--r--test/CodeGen/SystemZ/lower-copy-undef-src.mir14
-rw-r--r--test/CodeGen/Thumb2/v8_IT_5.ll2
-rw-r--r--test/CodeGen/X86/2007-01-08-InstrSched.ll4
-rw-r--r--test/CodeGen/X86/2010-01-18-DbgValue.ll13
-rw-r--r--test/CodeGen/X86/2012-11-30-handlemove-dbg.ll51
-rw-r--r--test/CodeGen/X86/2012-11-30-misched-dbg.ll142
-rw-r--r--test/CodeGen/X86/2012-11-30-regpres-dbg.ll47
-rw-r--r--test/CodeGen/X86/GlobalISel/add-scalar.ll44
-rw-r--r--test/CodeGen/X86/GlobalISel/binop.ll42
-rw-r--r--test/CodeGen/X86/GlobalISel/br.ll19
-rw-r--r--test/CodeGen/X86/GlobalISel/cmp.ll159
-rw-r--r--test/CodeGen/X86/GlobalISel/ext-x86-64.ll14
-rw-r--r--test/CodeGen/X86/GlobalISel/ext.ll18
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-cmp.mir179
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir64
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-ext.mir64
-rw-r--r--test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll (renamed from test/CodeGen/X86/GlobalISel/memop-x32.ll)0
-rw-r--r--test/CodeGen/X86/GlobalISel/memop-scalar.ll (renamed from test/CodeGen/X86/GlobalISel/memop.ll)64
-rw-r--r--test/CodeGen/X86/GlobalISel/memop-vec.ll39
-rw-r--r--test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir125
-rw-r--r--test/CodeGen/X86/GlobalISel/select-br.mir39
-rw-r--r--test/CodeGen/X86/GlobalISel/select-cmp.mir563
-rw-r--r--test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir38
-rw-r--r--test/CodeGen/X86/GlobalISel/select-ext.mir33
-rw-r--r--test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir (renamed from test/CodeGen/X86/GlobalISel/select-memop-x32.mir)0
-rw-r--r--test/CodeGen/X86/GlobalISel/select-memop-scalar.mir (renamed from test/CodeGen/X86/GlobalISel/select-memop.mir)137
-rw-r--r--test/CodeGen/X86/GlobalISel/select-memop-v128.mir143
-rw-r--r--test/CodeGen/X86/O0-pipeline.ll67
-rw-r--r--test/CodeGen/X86/all-ones-vector.ll112
-rw-r--r--test/CodeGen/X86/avg.ll833
-rw-r--r--test/CodeGen/X86/avx-basic.ll8
-rw-r--r--test/CodeGen/X86/avx-cvt-3.ll22
-rw-r--r--test/CodeGen/X86/avx-intrinsics-fast-isel.ll60
-rw-r--r--test/CodeGen/X86/avx-schedule.ll50
-rw-r--r--test/CodeGen/X86/avx.ll2
-rw-r--r--test/CodeGen/X86/avx512-cmp-kor-sequence.ll6
-rw-r--r--test/CodeGen/X86/avx512-gather-scatter-intrin.ll10
-rw-r--r--test/CodeGen/X86/avx512-intrinsics-upgrade.ll44
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll215
-rw-r--r--test/CodeGen/X86/avx512-mask-spills.ll40
-rw-r--r--test/CodeGen/X86/avx512-scalar_mask.ll107
-rw-r--r--test/CodeGen/X86/avx512-vselect.ll61
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll12
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics.ll16
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll24
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll2
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics.ll2
-rw-r--r--test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll2
-rw-r--r--test/CodeGen/X86/avx512dq-intrinsics.ll4
-rw-r--r--test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll10
-rw-r--r--test/CodeGen/X86/avx512dqvl-intrinsics.ll4
-rw-r--r--test/CodeGen/X86/avx512er-intrinsics.ll48
-rw-r--r--test/CodeGen/X86/avx512ifma-intrinsics.ll8
-rw-r--r--test/CodeGen/X86/avx512ifmavl-intrinsics.ll16
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll64
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics.ll28
-rw-r--r--test/CodeGen/X86/bmi.ll76
-rw-r--r--test/CodeGen/X86/bswap_tree2.ll35
-rw-r--r--test/CodeGen/X86/cast-vsel.ll37
-rw-r--r--test/CodeGen/X86/combine-abs.ll11
-rw-r--r--test/CodeGen/X86/combine-shl.ll3
-rw-r--r--test/CodeGen/X86/combine-srl.ll22
-rw-r--r--test/CodeGen/X86/constructor.ll5
-rw-r--r--test/CodeGen/X86/dbg-baseptr.ll62
-rw-r--r--test/CodeGen/X86/elf-associated.ll5
-rw-r--r--test/CodeGen/X86/fold-tied-op.ll7
-rw-r--r--test/CodeGen/X86/fp128-i128.ll2
-rw-r--r--test/CodeGen/X86/haddsub-2.ll12
-rw-r--r--test/CodeGen/X86/known-signbits-vector.ll61
-rw-r--r--test/CodeGen/X86/leaFixup32.mir508
-rw-r--r--test/CodeGen/X86/leaFixup64.mir1041
-rw-r--r--test/CodeGen/X86/lrshrink.ll57
-rw-r--r--test/CodeGen/X86/madd.ll34
-rw-r--r--test/CodeGen/X86/masked_gather_scatter.ll2
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-128.ll16
-rw-r--r--test/CodeGen/X86/misched-matrix.ll4
-rw-r--r--test/CodeGen/X86/not-and-simplify.ll28
-rw-r--r--test/CodeGen/X86/oddshuffles.ll34
-rw-r--r--test/CodeGen/X86/packss.ll11
-rw-r--r--test/CodeGen/X86/pmul.ll55
-rw-r--r--test/CodeGen/X86/pr28129.ll32
-rw-r--r--test/CodeGen/X86/pr29112.ll8
-rw-r--r--test/CodeGen/X86/pr30562.ll1
-rw-r--r--test/CodeGen/X86/pr31088.ll2
-rw-r--r--test/CodeGen/X86/pr32284.ll71
-rw-r--r--test/CodeGen/X86/pr32907.ll53
-rw-r--r--test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll37
-rw-r--r--test/CodeGen/X86/rotate.ll16
-rw-r--r--test/CodeGen/X86/sad.ll929
-rw-r--r--test/CodeGen/X86/select.ll28
-rw-r--r--test/CodeGen/X86/setcc-wide-types.ll56
-rw-r--r--test/CodeGen/X86/shrink_vmul_sse.ll2
-rw-r--r--test/CodeGen/X86/shuffle-of-splat-multiuses.ll34
-rw-r--r--test/CodeGen/X86/sse-intrinsics-fast-isel.ll10
-rw-r--r--test/CodeGen/X86/sse1.ll20
-rw-r--r--test/CodeGen/X86/sse3-avx-addsub-2.ll14
-rw-r--r--test/CodeGen/X86/sse41.ll8
-rw-r--r--test/CodeGen/X86/stackmap-frame-setup.ll4
-rw-r--r--test/CodeGen/X86/vec_int_to_fp.ll84
-rw-r--r--test/CodeGen/X86/vec_set-2.ll31
-rw-r--r--test/CodeGen/X86/vec_set-3.ll45
-rw-r--r--test/CodeGen/X86/vec_set-4.ll38
-rw-r--r--test/CodeGen/X86/vec_set-6.ll23
-rw-r--r--test/CodeGen/X86/vec_set-7.ll18
-rw-r--r--test/CodeGen/X86/vec_set-8.ll16
-rw-r--r--test/CodeGen/X86/vec_set-A.ll19
-rw-r--r--test/CodeGen/X86/vec_set-B.ll40
-rw-r--r--test/CodeGen/X86/vec_set-C.ll10
-rw-r--r--test/CodeGen/X86/vec_set.ll63
-rw-r--r--test/CodeGen/X86/vector-bitreverse.ll6
-rw-r--r--test/CodeGen/X86/vector-blend.ll4
-rw-r--r--test/CodeGen/X86/vector-lzcnt-128.ll380
-rw-r--r--test/CodeGen/X86/vector-lzcnt-256.ll536
-rw-r--r--test/CodeGen/X86/vector-narrow-binop.ll111
-rw-r--r--test/CodeGen/X86/vector-pcmp.ll27
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-256.ll580
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-256.ll434
-rw-r--r--test/CodeGen/X86/vector-shift-shl-256.ll377
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v32.ll356
-rw-r--r--test/CodeGen/X86/vector-sqrt.ll8
-rw-r--r--test/CodeGen/X86/viabs.ll107
-rw-r--r--test/CodeGen/X86/vselect-pcmp.ll12
-rw-r--r--test/CodeGen/X86/x86-interleaved-access.ll14
-rw-r--r--test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll26
-rw-r--r--test/CodeGen/X86/x86-no_caller_saved_registers.ll62
-rw-r--r--test/CodeGen/X86/x86-shrink-wrapping.ll53
-rw-r--r--test/CodeGen/X86/xop-intrinsics-fast-isel.ll8
-rw-r--r--test/DebugInfo/COFF/local-variables.ll5
-rw-r--r--test/DebugInfo/COFF/no-cus.ll25
-rw-r--r--test/DebugInfo/Inputs/typeunit-header.elf-x86-64bin0 -> 840 bytes
-rw-r--r--test/DebugInfo/Inputs/typeunit-header.s49
-rw-r--r--test/DebugInfo/PDB/DIA/pdbdump-symbol-format.test4
-rw-r--r--test/DebugInfo/X86/dbg-declare-inalloca.ll199
-rw-r--r--test/DebugInfo/X86/split-dwarf-cross-unit-reference.ll198
-rw-r--r--test/DebugInfo/typeunit-header.test15
-rw-r--r--test/Feature/intrinsic-noduplicate.ll1
-rw-r--r--test/Instrumentation/MemorySanitizer/msan_basic.ll64
-rw-r--r--test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll68
-rw-r--r--test/Instrumentation/MemorySanitizer/pr32842.ll20
-rw-r--r--test/Instrumentation/MemorySanitizer/vector_arith.ll1
-rw-r--r--test/Instrumentation/MemorySanitizer/vector_cmp.ll1
-rw-r--r--test/Instrumentation/MemorySanitizer/vector_cvt.ll1
-rw-r--r--test/Instrumentation/MemorySanitizer/vector_pack.ll1
-rw-r--r--test/Instrumentation/MemorySanitizer/vector_shift.ll1
-rw-r--r--test/LTO/Resolution/X86/ifunc.ll15
-rw-r--r--test/MC/AArch64/directive-cpu-err.s9
-rw-r--r--test/MC/AArch64/label-arithmetic-diags-elf.s9
-rw-r--r--test/MC/AMDGPU/flat.s66
-rw-r--r--test/MC/AMDGPU/literal16.s8
-rw-r--r--test/MC/AMDGPU/vop2.s38
-rw-r--r--test/MC/AMDGPU/vop3-convert.s14
-rw-r--r--test/MC/AsmParser/altmacro_string_escape.s29
-rw-r--r--test/MC/Disassembler/AMDGPU/flat_vi.txt24
-rw-r--r--test/MC/Disassembler/AMDGPU/literal16_vi.txt6
-rw-r--r--test/MC/Disassembler/AMDGPU/vop2_vi.txt30
-rw-r--r--test/MC/Disassembler/AMDGPU/vop3_vi.txt18
-rw-r--r--test/MC/Disassembler/PowerPC/ppc64-encoding-p9vector.txt4
-rw-r--r--test/MC/Disassembler/SystemZ/insns-z13.txt4056
-rw-r--r--test/MC/Disassembler/SystemZ/insns.txt6511
-rw-r--r--test/MC/SystemZ/insn-bad-z13.s792
-rw-r--r--test/MC/SystemZ/insn-bad-z196.s53
-rw-r--r--test/MC/SystemZ/insn-bad-zEC12.s511
-rw-r--r--test/MC/SystemZ/insn-bad.s2264
-rw-r--r--test/MC/SystemZ/insn-good-z13.s1710
-rw-r--r--test/MC/SystemZ/insn-good-z196.s158
-rw-r--r--test/MC/SystemZ/insn-good-zEC12.s16
-rw-r--r--test/MC/SystemZ/insn-good.s2131
-rw-r--r--test/Object/Inputs/COFF/empty-drectve.yaml14
-rw-r--r--test/Object/X86/archive-symbol-table.s19
-rw-r--r--test/Object/X86/nm-ir.ll2
-rw-r--r--test/Object/coff-empty-drectve.test3
-rw-r--r--test/Object/invalid.test4
-rw-r--r--test/Object/wasm-invalid-start.test10
-rw-r--r--test/ObjectYAML/wasm/export_section.yaml28
-rw-r--r--test/ObjectYAML/wasm/function_section.yaml4
-rw-r--r--test/ObjectYAML/wasm/import_section.yaml45
-rw-r--r--test/ObjectYAML/wasm/start_section.yaml9
-rw-r--r--test/TableGen/AsmVariant.td2
-rw-r--r--test/TableGen/RegisterEncoder.td35
-rw-r--r--test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll2
-rw-r--r--test/Transforms/CodeExtractor/MultipleExitBranchProb.ll2
-rw-r--r--test/Transforms/CodeExtractor/PartialInlineAnd.ll4
-rw-r--r--test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll41
-rw-r--r--test/Transforms/CodeExtractor/PartialInlineHighCost.ll107
-rw-r--r--test/Transforms/CodeExtractor/PartialInlineOr.ll4
-rw-r--r--test/Transforms/CodeExtractor/PartialInlineOrAnd.ll4
-rw-r--r--test/Transforms/CodeExtractor/SingleCondition.ll4
-rw-r--r--test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll4
-rw-r--r--test/Transforms/CodeGenPrepare/section-samplepgo.ll57
-rw-r--r--test/Transforms/CodeGenPrepare/section.ll20
-rw-r--r--test/Transforms/ConstProp/calls-math-finite.ll83
-rw-r--r--test/Transforms/ConstProp/calls.ll206
-rw-r--r--test/Transforms/ConstProp/sse.ll208
-rw-r--r--test/Transforms/Coroutines/coro-eh-aware-edge-split.ll218
-rw-r--r--test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll7
-rw-r--r--test/Transforms/GVN/PRE/nonintegral.ll39
-rw-r--r--test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll2
-rw-r--r--test/Transforms/InferFunctionAttrs/annotate.ll126
-rw-r--r--test/Transforms/InferFunctionAttrs/no-proto.ll126
-rw-r--r--test/Transforms/Inline/inline-cold.ll20
-rw-r--r--test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll2
-rw-r--r--test/Transforms/Inline/partial-inline-act.ll2
-rw-r--r--test/Transforms/Inline/prof-update.ll35
-rw-r--r--test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll (renamed from test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll)68
-rw-r--r--test/Transforms/InstCombine/AArch64/lit.local.cfg2
-rw-r--r--test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll (renamed from test/Transforms/InstCombine/amdgcn-intrinsics.ll)0
-rw-r--r--test/Transforms/InstCombine/AMDGPU/lit.local.cfg2
-rw-r--r--test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll65
-rw-r--r--test/Transforms/InstCombine/ARM/constant-fold-hang.ll (renamed from test/Transforms/InstCombine/constant-fold-hang.ll)0
-rw-r--r--test/Transforms/InstCombine/ARM/lit.local.cfg2
-rw-r--r--test/Transforms/InstCombine/ARM/neon-intrinsics.ll (renamed from test/Transforms/InstCombine/neon-intrinsics.ll)0
-rw-r--r--test/Transforms/InstCombine/PowerPC/aligned-altivec.ll (renamed from test/Transforms/InstCombine/aligned-altivec.ll)0
-rw-r--r--test/Transforms/InstCombine/PowerPC/aligned-qpx.ll (renamed from test/Transforms/InstCombine/aligned-qpx.ll)0
-rw-r--r--test/Transforms/InstCombine/PowerPC/lit.local.cfg3
-rw-r--r--test/Transforms/InstCombine/PowerPC/vsx-unaligned.ll (renamed from test/Transforms/InstCombine/vsx-unaligned.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll (renamed from test/Transforms/InstCombine/X86FsubCmpCombine.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/blend_x86.ll (renamed from test/Transforms/InstCombine/blend_x86.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/lit.local.cfg2
-rw-r--r--test/Transforms/InstCombine/X86/pr2645-1.ll (renamed from test/Transforms/InstCombine/pr2645-1.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/shufflemask-undef.ll (renamed from test/Transforms/InstCombine/shufflemask-undef.ll)3
-rw-r--r--test/Transforms/InstCombine/X86/x86-avx2.ll (renamed from test/Transforms/InstCombine/x86-avx2.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-avx512.ll (renamed from test/Transforms/InstCombine/x86-avx512.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-crc32-demanded.ll (renamed from test/Transforms/InstCombine/x86-crc32-demanded.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-f16c.ll (renamed from test/Transforms/InstCombine/x86-f16c.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-fma.ll (renamed from test/Transforms/InstCombine/x86-fma.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-insertps.ll (renamed from test/Transforms/InstCombine/x86-insertps.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-masked-memops.ll (renamed from test/Transforms/InstCombine/x86-masked-memops.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-movmsk.ll (renamed from test/Transforms/InstCombine/x86-movmsk.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-muldq.ll (renamed from test/Transforms/InstCombine/x86-muldq.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-pack.ll (renamed from test/Transforms/InstCombine/x86-pack.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-pshufb.ll (renamed from test/Transforms/InstCombine/x86-pshufb.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-sse.ll (renamed from test/Transforms/InstCombine/x86-sse.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-sse2.ll (renamed from test/Transforms/InstCombine/x86-sse2.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-sse41.ll (renamed from test/Transforms/InstCombine/x86-sse41.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-sse4a.ll (renamed from test/Transforms/InstCombine/x86-sse4a.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll110
-rw-r--r--test/Transforms/InstCombine/X86/x86-vector-shifts.ll (renamed from test/Transforms/InstCombine/x86-vector-shifts.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-vperm2.ll (renamed from test/Transforms/InstCombine/x86-vperm2.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-vpermil.ll (renamed from test/Transforms/InstCombine/x86-vpermil.ll)0
-rw-r--r--test/Transforms/InstCombine/X86/x86-xop.ll (renamed from test/Transforms/InstCombine/x86-xop.ll)0
-rw-r--r--test/Transforms/InstCombine/add.ll26
-rw-r--r--test/Transforms/InstCombine/and.ll2
-rw-r--r--test/Transforms/InstCombine/bit-tracking.ll26
-rw-r--r--test/Transforms/InstCombine/cast.ll38
-rw-r--r--test/Transforms/InstCombine/constant-fold-iteration.ll10
-rw-r--r--test/Transforms/InstCombine/demorgan.ll8
-rw-r--r--test/Transforms/InstCombine/icmp.ll15
-rw-r--r--test/Transforms/InstCombine/intrinsics.ll29
-rw-r--r--test/Transforms/InstCombine/logical-select.ll75
-rw-r--r--test/Transforms/InstCombine/not.ll76
-rw-r--r--test/Transforms/InstCombine/or-xor.ll70
-rw-r--r--test/Transforms/InstCombine/or.ll109
-rw-r--r--test/Transforms/InstCombine/sext.ll2
-rw-r--r--test/Transforms/InstCombine/trunc.ll2
-rw-r--r--test/Transforms/InstCombine/vec_demanded_elts.ll108
-rw-r--r--test/Transforms/InstCombine/xor2.ll11
-rw-r--r--test/Transforms/InstNamer/basic.ll19
-rw-r--r--test/Transforms/InstSimplify/AndOrXor.ll173
-rw-r--r--test/Transforms/InstSimplify/apint-or.ll72
-rw-r--r--test/Transforms/InstSimplify/compare.ll7
-rw-r--r--test/Transforms/InstSimplify/or.ll181
-rw-r--r--test/Transforms/LoopIdiom/ARM/ctlz.ll185
-rw-r--r--test/Transforms/LoopIdiom/X86/ctlz.ll185
-rw-r--r--test/Transforms/LoopUnroll/not-rotated.ll2
-rw-r--r--test/Transforms/LoopVectorize/X86/svml-calls-finite.ll187
-rw-r--r--test/Transforms/LoopVectorize/induction.ll45
-rw-r--r--test/Transforms/LoopVectorize/pr32859.ll30
-rw-r--r--test/Transforms/NewGVN/pr32934.ll69
-rw-r--r--test/Transforms/NewGVN/pr32952.ll42
-rw-r--r--test/Transforms/NewGVN/verify-memoryphi.ll29
-rw-r--r--test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll22
-rw-r--r--test/Transforms/SLPVectorizer/AArch64/getelementptr.ll43
-rw-r--r--test/Transforms/SLPVectorizer/AArch64/horizontal.ll33
-rw-r--r--test/Transforms/SLPVectorizer/AArch64/remarks.ll32
-rw-r--r--test/Transforms/SLPVectorizer/X86/arith-add.ll649
-rw-r--r--test/Transforms/SLPVectorizer/X86/arith-mul.ll700
-rw-r--r--test/Transforms/SLPVectorizer/X86/arith-sub.ll649
-rw-r--r--test/Transforms/SLPVectorizer/X86/shift-ashr.ll913
-rw-r--r--test/Transforms/SLPVectorizer/X86/shift-lshr.ll862
-rw-r--r--test/Transforms/SLPVectorizer/X86/shift-shl.ll814
-rw-r--r--test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll199
-rw-r--r--test/Transforms/SpeculativeExecution/spec-other.ll32
-rw-r--r--test/Transforms/SpeculativeExecution/spec-vector.ll73
-rw-r--r--test/Transforms/Util/split-bit-piece.ll110
-rw-r--r--test/Verifier/metadata-function-dbg.ll16
-rw-r--r--test/tools/llvm-pdbdump/Inputs/FilterTest.cpp18
-rw-r--r--test/tools/llvm-pdbdump/Inputs/FilterTest.pdbbin44032 -> 44032 bytes
-rw-r--r--test/tools/llvm-pdbdump/regex-filter.test8
-rw-r--r--test/tools/llvm-pdbdump/symbol-filters.test74
-rw-r--r--test/tools/llvm-profdata/sample-profile-basic.test7
-rw-r--r--test/tools/llvm-readobj/wasm-invalid.test7
-rw-r--r--tools/bugpoint/ExtractFunction.cpp3
-rw-r--r--tools/llc/llc.cpp2
-rw-r--r--tools/lli/RemoteJITUtils.h5
-rw-r--r--tools/llvm-ar/llvm-ar.cpp2
-rw-r--r--tools/llvm-pdbdump/LLVMOutputStyle.cpp2
-rw-r--r--tools/llvm-pdbdump/PrettyCompilandDumper.cpp12
-rw-r--r--tools/llvm-pdbdump/PrettyFunctionDumper.cpp10
-rw-r--r--tools/llvm-pdbdump/llvm-pdbdump.cpp102
-rw-r--r--tools/llvm-pdbdump/llvm-pdbdump.h23
-rw-r--r--tools/llvm-readobj/COFFDumper.cpp8
-rw-r--r--tools/llvm-readobj/llvm-readobj.cpp21
-rw-r--r--tools/llvm-rtdyld/llvm-rtdyld.cpp3
-rw-r--r--tools/obj2yaml/wasm2yaml.cpp63
-rw-r--r--tools/opt/opt.cpp4
-rw-r--r--tools/yaml2obj/yaml2wasm.cpp11
-rw-r--r--unittests/Analysis/ProfileSummaryInfoTest.cpp6
-rw-r--r--unittests/Analysis/TargetLibraryInfoTest.cpp46
-rw-r--r--unittests/DebugInfo/CMakeLists.txt2
-rw-r--r--unittests/DebugInfo/CodeView/CMakeLists.txt11
-rw-r--r--unittests/DebugInfo/CodeView/ErrorChecking.h61
-rw-r--r--unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp353
-rw-r--r--unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp2
-rw-r--r--unittests/ExecutionEngine/Orc/OrcTestCommon.h2
-rw-r--r--unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp29
-rw-r--r--unittests/IR/ConstantRangeTest.cpp9
-rw-r--r--unittests/IR/InstructionsTest.cpp7
-rw-r--r--unittests/IR/TypeBuilderTest.cpp30
-rw-r--r--unittests/Support/CMakeLists.txt1
-rw-r--r--unittests/Support/DynamicLibrary/DynamicLibraryTest.cpp4
-rw-r--r--unittests/Support/ParallelTest.cpp53
-rw-r--r--unittests/Support/Path.cpp2
-rw-r--r--unittests/Transforms/Utils/Cloning.cpp65
-rw-r--r--utils/TableGen/CodeGenInstruction.cpp1
-rw-r--r--utils/TableGen/SubtargetEmitter.cpp2
-rw-r--r--utils/TableGen/X86RecognizableInstr.cpp125
-rw-r--r--utils/TableGen/X86RecognizableInstr.h122
-rwxr-xr-xutils/git-svn/git-llvm33
-rwxr-xr-xutils/release/build_llvm_package.bat8
-rw-r--r--utils/vscode/README18
-rw-r--r--utils/vscode/tablegen/.vscode/launch.json13
-rw-r--r--utils/vscode/tablegen/CHANGELOG.md4
-rw-r--r--utils/vscode/tablegen/README.md13
-rw-r--r--utils/vscode/tablegen/language-configuration.json30
-rw-r--r--utils/vscode/tablegen/package.json26
-rw-r--r--utils/vscode/tablegen/syntaxes/TableGen.tmLanguage132
-rw-r--r--utils/vscode/tablegen/vsc-extension-quickstart.md27
869 files changed, 53816 insertions, 18688 deletions
diff --git a/CREDITS.TXT b/CREDITS.TXT
index 15d822a68091..20bd553ae2bc 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -265,7 +265,7 @@ D: Release manager (1.7+)
N: Sylvestre Ledru
E: sylvestre@debian.org
W: http://sylvestre.ledru.info/
-W: http://llvm.org/apt/
+W: http://apt.llvm.org/
D: Debian and Ubuntu packaging
D: Continuous integration with jenkins
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 0331d0fa10ab..de8e9bf9a494 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -530,16 +530,6 @@ else()
message(STATUS "Doxygen disabled.")
endif()
-if (LLVM_ENABLE_SPHINX)
- message(STATUS "Sphinx enabled.")
- find_package(Sphinx REQUIRED)
- if (LLVM_BUILD_DOCS)
- add_custom_target(sphinx ALL)
- endif()
-else()
- message(STATUS "Sphinx disabled.")
-endif()
-
set(LLVM_BINDINGS "")
if(WIN32)
message(STATUS "Go bindings disabled.")
diff --git a/cmake/modules/AddSphinxTarget.cmake b/cmake/modules/AddSphinxTarget.cmake
index cfc7f38e9e77..4540c5c36c8e 100644
--- a/cmake/modules/AddSphinxTarget.cmake
+++ b/cmake/modules/AddSphinxTarget.cmake
@@ -1,3 +1,16 @@
+
+# Create sphinx target
+if (LLVM_ENABLE_SPHINX)
+ message(STATUS "Sphinx enabled.")
+ find_package(Sphinx REQUIRED)
+ if (LLVM_BUILD_DOCS AND NOT TARGET sphinx)
+ add_custom_target(sphinx ALL)
+ endif()
+else()
+ message(STATUS "Sphinx disabled.")
+endif()
+
+
# Handy function for creating the different Sphinx targets.
#
# ``builder`` should be one of the supported builders used by
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index 6dff219ae37f..4437610146c4 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -103,8 +103,8 @@ endif()
endif()
if (LLVM_ENABLE_SPHINX)
+ include(AddSphinxTarget)
if (SPHINX_FOUND)
- include(AddSphinxTarget)
if (${SPHINX_OUTPUT_HTML})
add_sphinx_target(html llvm)
endif()
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index d5c8ba4b8214..0cb415ad764e 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@@ -699,14 +699,14 @@ For developers to work with a git monorepo
.. note::
- This set-up is using unofficial mirror hosted on GitHub, use with caution.
+ This set-up is using an unofficial mirror hosted on GitHub; use with caution.
To set up a clone of all the llvm projects using a unified repository:
.. code-block:: console
% export TOP_LEVEL_DIR=`pwd`
- % git clone https://github.com/llvm-project/llvm-project/
+ % git clone https://github.com/llvm-project/llvm-project-20170507/ llvm-project
% cd llvm-project
% git config branch.master.rebase true
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index dad99e3352dd..9ff47e8366dc 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -641,8 +641,9 @@ assume that the globals are densely packed in their section and try to
iterate over them as an array, alignment padding would break this
iteration. The maximum alignment is ``1 << 29``.
-Globals can also have a :ref:`DLL storage class <dllstorageclass>` and
-an optional list of attached :ref:`metadata <metadata>`,
+Globals can also have a :ref:`DLL storage class <dllstorageclass>`,
+an optional set of :ref:`global attributes <glattrs>`, and
+an optional list of attached :ref:`metadata <metadata>`.
Variables and aliases can have a
:ref:`Thread Local Storage Model <tls_model>`.
@@ -1624,6 +1625,14 @@ example:
the ELF x86-64 abi, but it can be disabled for some compilation
units.
+.. _glattrs:
+
+Global Attributes
+-----------------
+
+Attributes may be set to communicate additional information about a global variable.
+Unlike :ref:`function attributes <fnattrs>`, attributes on a global variable
+are grouped into a single :ref:`attribute group <attrgrp>`.
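+
+For example (a minimal sketch; the attribute name is illustrative, following
+the syntax exercised by the assembler tests added in this import):
+
+.. code-block:: llvm
+
+      @g = global i32 0, align 4 #0        ; the global refers to attribute group #0
+
+      attributes #0 = { "custom-info" }
+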
.. _opbundles:
@@ -3664,6 +3673,9 @@ Sparc:
- ``I``: An immediate 13-bit signed integer.
- ``r``: A 32-bit integer register.
+- ``f``: Any floating-point register on SparcV8, or a floating point
+ register in the "low" half of the registers on SparcV9.
+- ``e``: Any floating point register. (Same as ``f`` on SparcV8.)
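+
+For illustration, a hypothetical inline-assembly use of the new ``f``
+constraint (the instruction and value names are illustrative):
+
+.. code-block:: llvm
+
+      %sum = call float asm "fadds $1, $2, $0", "=f,f,f"(float %a, float %b)
+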
SystemZ:
@@ -11687,6 +11699,338 @@ Examples:
%r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c
+
+Experimental Vector Reduction Intrinsics
+----------------------------------------
+
+Horizontal reductions of vectors can be expressed using the following
+intrinsics. Each one takes a vector operand as an input (the floating-point
+``fadd`` and ``fmul`` variants additionally take a scalar accumulator) and
+applies its respective operation across all elements of the vector, returning
+a single scalar result of the same element type.
+
+
+'``llvm.experimental.vector.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a)
+ declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.add.*``' intrinsics do an integer ``ADD``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
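+
+Examples:
+"""""""""
+
+A minimal sketch (the value name is illustrative):
+
+.. code-block:: llvm
+
+      %sum = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %v) ; yields the i32 sum of the four lanes of %v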
+
+'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
+ declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating point
+``ADD`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has fast-math flags, then the reduction will not preserve
+the associativity of an equivalent scalarized counterpart. If it does not have
+fast-math flags, then the reduction will be *ordered*, implying that the
+operation respects the associativity of a scalarized reduction.
+
+
+Arguments:
+""""""""""
+The first argument to this intrinsic is a scalar accumulator value, which is
+only used when there are no fast-math flags attached. This argument may be undef
+when fast-math flags are used.
+
+The second argument must be a vector of floating point values.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+ %fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
+ %ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+
+
+'``llvm.experimental.vector.reduce.mul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
+ declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.mul.*``' intrinsics do an integer ``MUL``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a)
+ declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating point
+``MUL`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has fast-math flags, then the reduction will not preserve
+the associativity of an equivalent scalarized counterpart. If it does not have
+fast-math flags, then the reduction will be *ordered*, implying that the
+operation respects the associativity of a scalarized reduction.
+
+
+Arguments:
+""""""""""
+The first argument to this intrinsic is a scalar accumulator value, which is
+only used when there are no fast-math flags attached. This argument may be undef
+when fast-math flags are used.
+
+The second argument must be a vector of floating point values.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+ %fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
+ %ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+
+'``llvm.experimental.vector.reduce.and.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.and.*``' intrinsics do a bitwise ``AND``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
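+
+Examples:
+"""""""""
+
+A minimal sketch (the value name is illustrative); the ``or`` and ``xor``
+reductions below are called the same way:
+
+.. code-block:: llvm
+
+      %m = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %masks) ; bitwise AND across all four lanes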
+
+'``llvm.experimental.vector.reduce.or.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction
+of a vector, returning the result as a scalar. The return type matches the
+element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.xor.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.xor.*``' intrinsics perform a bitwise
+``XOR`` reduction of a vector, returning the result as a scalar. The return
+type matches the element type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
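+
+Examples:
+"""""""""
+
+As an illustration, with ``%v`` a placeholder ``<4 x i32>`` value:
+
+.. code-block:: llvm
+
+ %r = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> %v) ; bitwise XOR of all elements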
+
+'``llvm.experimental.vector.reduce.smax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.smax.*``' intrinsics perform a signed
+integer ``MAX`` reduction of a vector, returning the result as a scalar. The
+return type matches the element type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
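+
+Examples:
+"""""""""
+
+A minimal sketch, with ``%v`` a placeholder ``<4 x i32>`` value:
+
+.. code-block:: llvm
+
+ %max = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %v) ; largest element, signed compare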
+
+'``llvm.experimental.vector.reduce.smin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.smin.*``' intrinsics perform a signed
+integer ``MIN`` reduction of a vector, returning the result as a scalar. The
+return type matches the element type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
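+
+Examples:
+"""""""""
+
+For instance, with ``%v`` as a placeholder ``<4 x i32>`` value:
+
+.. code-block:: llvm
+
+ %min = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %v) ; smallest element, signed compare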
+
+'``llvm.experimental.vector.reduce.umax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.umax.*``' intrinsics perform an unsigned
+integer ``MAX`` reduction of a vector, returning the result as a scalar. The
+return type matches the element type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
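+
+Examples:
+"""""""""
+
+A short sketch, with ``%v`` a placeholder ``<4 x i32>`` value:
+
+.. code-block:: llvm
+
+ %max = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %v) ; largest element, unsigned compare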
+
+'``llvm.experimental.vector.reduce.umin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.umin.*``' intrinsics perform an unsigned
+integer ``MIN`` reduction of a vector, returning the result as a scalar. The
+return type matches the element type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
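+
+Examples:
+"""""""""
+
+As an illustration, with ``%v`` standing in for a ``<4 x i32>`` value:
+
+.. code-block:: llvm
+
+ %min = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %v) ; smallest element, unsigned compare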
+
+'``llvm.experimental.vector.reduce.fmax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %a)
+ declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics perform a floating
+point ``MAX`` reduction of a vector, returning the result as a scalar. The
+return type matches the element type of the vector input.
+
+If the intrinsic call has the ``nnan`` fast-math flag, then the operation may
+assume that NaNs are not present in the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating point values.
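+
+Examples:
+"""""""""
+
+A minimal sketch, with ``%v`` a placeholder ``<4 x float>`` value; the
+``nnan`` flag is optional:
+
+.. code-block:: llvm
+
+ %max = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %v) ; max, assuming no NaNs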
+
+'``llvm.experimental.vector.reduce.fmin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+ declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %a)
+ declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics perform a floating
+point ``MIN`` reduction of a vector, returning the result as a scalar. The
+return type matches the element type of the vector input.
+
+If the intrinsic call has the ``nnan`` fast-math flag, then the operation may
+assume that NaNs are not present in the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating point values.
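+
+Examples:
+"""""""""
+
+For instance, with ``%v`` as a placeholder ``<4 x float>`` value; the
+``nnan`` flag is optional:
+
+.. code-block:: llvm
+
+ %min = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %v) ; min, assuming no NaNs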
+
Half Precision Floating Point Intrinsics
----------------------------------------
diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst
index 35687e258182..ebc3fb772e81 100644
--- a/docs/Lexicon.rst
+++ b/docs/Lexicon.rst
@@ -249,6 +249,14 @@ S
Superword-Level Parallelism, same as :ref:`Basic-Block Vectorization
<lexicon-bb-vectorization>`.
+**Splat**
+ Splat refers to a vector of identical scalar elements.
+
+ The term is based on the PowerPC Altivec instructions that provided this
+ functionality in hardware, for example "vsplth" and the corresponding
+ software intrinsic "vec_splat()". Examples of other hardware names for this
+ operation include "duplicate" (ARM) and "broadcast" (x86).
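+
+ In LLVM IR a splat is typically expressed as an ``insertelement`` into
+ element zero followed by a ``shufflevector`` with an all-zero mask, as in
+ this minimal sketch (``%x`` is an arbitrary scalar):
+
+ .. code-block:: llvm
+
+  %ins = insertelement <4 x i32> undef, i32 %x, i32 0
+  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer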
+
**SRoA**
Scalar Replacement of Aggregates
diff --git a/docs/LibFuzzer.rst b/docs/LibFuzzer.rst
index a11baa720ec8..5acfa04ce1f4 100644
--- a/docs/LibFuzzer.rst
+++ b/docs/LibFuzzer.rst
@@ -305,6 +305,10 @@ The most important command line options are:
- 1 : close ``stdout``
- 2 : close ``stderr``
- 3 : close both ``stdout`` and ``stderr``.
+``-print_coverage``
+ If 1, print coverage information as text at exit.
+``-dump_coverage``
+ If 1, dump coverage information as a .sancov file at exit.
For the full list of flags run the fuzzer binary with ``-help=1``.
@@ -543,12 +547,19 @@ You can get the coverage for your corpus like this:
.. code-block:: console
- ASAN_OPTIONS=coverage=1 ./fuzzer CORPUS_DIR -runs=0
+ ./fuzzer CORPUS_DIR -runs=0 -print_coverage=1
This will run all tests in the CORPUS_DIR but will not perform any fuzzing.
-At the end of the process it will dump a single ``.sancov`` file with coverage
-information. See SanitizerCoverage_ for details on querying the file using the
-``sancov`` tool.
+At the end of the process it will print a textual report of which code has
+been covered and which has not.
+
+Alternatively, use
+
+.. code-block:: console
+
+ ./fuzzer CORPUS_DIR -runs=0 -dump_coverage=1
+
+which will dump a ``.sancov`` file with coverage information.
+See SanitizerCoverage_ for details on querying the file using the ``sancov`` tool.
You may also use other ways to visualize coverage,
e.g. using `Clang coverage <http://clang.llvm.org/docs/SourceBasedCodeCoverage.html>`_,
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index dbffb53d5a51..bc35e62189a2 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -40,6 +40,10 @@ Non-comprehensive list of changes in this release
functionality, or simply have a lot to talk about), see the `NOTE` below
for adding a new subsection.
+* LLVM's ``WeakVH`` has been renamed to ``WeakTrackingVH`` and a new ``WeakVH``
+ has been introduced. The new ``WeakVH`` nulls itself out on deletion, but
+ does not track values across RAUW.
+
* ... next change ...
.. NOTE
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index c3822e35906a..94fbd1a29bf9 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -157,6 +157,11 @@ private:
return isSingleWord() ? U.VAL : U.pVal[whichWord(bitPosition)];
}
+ /// Utility method to change the bit width of this APInt to a new bit width,
+ /// allocating and/or deallocating as necessary. There is no guarantee on the
+ /// value of any bits upon return. Callers should populate the bits afterward.
+ void reallocate(unsigned NewBitWidth);
+
/// \brief Convert a char array into an APInt
///
/// \param radix 2, 8, 10, 16, or 36
@@ -1437,6 +1442,12 @@ public:
/// as "bitPosition".
void flipBit(unsigned bitPosition);
+ /// Negate this APInt in place.
+ void negate() {
+ flipAllBits();
+ ++(*this);
+ }
+
/// Insert the bits from a smaller APInt starting at bitPosition.
void insertBits(const APInt &SubBits, unsigned bitPosition);
@@ -1646,12 +1657,7 @@ public:
/// re-interprets the bits as a double. Note that it is valid to do this on
/// any bit width. Exactly 64 bits will be translated.
double bitsToDouble() const {
- union {
- uint64_t I;
- double D;
- } T;
- T.I = (isSingleWord() ? U.VAL : U.pVal[0]);
- return T.D;
+ return BitsToDouble(getWord(0));
}
/// \brief Converts APInt bits to a double
@@ -1660,12 +1666,7 @@ public:
/// re-interprets the bits as a float. Note that it is valid to do this on
/// any bit width. Exactly 32 bits will be translated.
float bitsToFloat() const {
- union {
- unsigned I;
- float F;
- } T;
- T.I = unsigned((isSingleWord() ? U.VAL : U.pVal[0]));
- return T.F;
+ return BitsToFloat(getWord(0));
}
/// \brief Converts a double to APInt bits.
@@ -1673,12 +1674,7 @@ public:
/// The conversion does not do a translation from double to integer, it just
/// re-interprets the bits of the double.
static APInt doubleToBits(double V) {
- union {
- uint64_t I;
- double D;
- } T;
- T.D = V;
- return APInt(sizeof T * CHAR_BIT, T.I);
+ return APInt(sizeof(double) * CHAR_BIT, DoubleToBits(V));
}
/// \brief Converts a float to APInt bits.
@@ -1686,12 +1682,7 @@ public:
/// The conversion does not do a translation from float to integer, it just
/// re-interprets the bits of the float.
static APInt floatToBits(float V) {
- union {
- unsigned I;
- float F;
- } T;
- T.F = V;
- return APInt(sizeof T * CHAR_BIT, T.I);
+ return APInt(sizeof(float) * CHAR_BIT, FloatToBits(V));
}
/// @}
@@ -1852,10 +1843,9 @@ public:
unsigned);
/// DST = LHS * RHS, where DST has width the sum of the widths of the
- /// operands. No overflow occurs. DST must be disjoint from both
- /// operands. Returns the number of parts required to hold the result.
- static unsigned tcFullMultiply(WordType *, const WordType *,
- const WordType *, unsigned, unsigned);
+ /// operands. No overflow occurs. DST must be disjoint from both operands.
+ static void tcFullMultiply(WordType *, const WordType *,
+ const WordType *, unsigned, unsigned);
/// If RHS is zero LHS and REMAINDER are left unchanged, return one.
/// Otherwise set LHS to LHS / RHS with the fractional part discarded, set
@@ -1997,8 +1987,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const APInt &I) {
}
inline APInt operator-(APInt v) {
- v.flipAllBits();
- ++v;
+ v.negate();
return v;
}
diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h
index e835f1516225..4a2af7cd68a6 100644
--- a/include/llvm/ADT/BitVector.h
+++ b/include/llvm/ADT/BitVector.h
@@ -255,7 +255,7 @@ public:
/// find_prev - Returns the index of the first set bit that precedes
/// the bit at \p PriorTo. Returns -1 if all previous bits are unset.
- int find_prev(unsigned PriorTo) {
+ int find_prev(unsigned PriorTo) const {
if (PriorTo == 0)
return -1;
diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 15945adbe589..8c28412bb607 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -706,6 +706,18 @@ struct is_one_of<T, U, Ts...> {
std::is_same<T, U>::value || is_one_of<T, Ts...>::value;
};
+/// \brief traits class for checking whether type T is a base class for all
+/// the given types in the variadic list.
+template <typename T, typename... Ts> struct are_base_of {
+ static const bool value = true;
+};
+
+template <typename T, typename U, typename... Ts>
+struct are_base_of<T, U, Ts...> {
+ static const bool value =
+ std::is_base_of<T, U>::value && are_base_of<T, Ts...>::value;
+};
+
//===----------------------------------------------------------------------===//
// Extra additions for arrays
//===----------------------------------------------------------------------===//
@@ -1079,7 +1091,7 @@ private:
///
/// std::vector<char> Items = {'A', 'B', 'C', 'D'};
/// for (auto X : enumerate(Items)) {
-/// printf("Item %d - %c\n", X.Index, X.Value);
+/// printf("Item %d - %c\n", X.index(), X.value());
/// }
///
/// Output:
diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h
index 26f11924b771..1c109be3bab3 100644
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@@ -106,6 +106,13 @@ static inline std::string fromHex(StringRef Input) {
return Output;
}
+/// \brief Convert the string \p S to an integer of the specified type using
+/// the radix \p Base. If \p Base is 0, auto-detects the radix.
+/// Returns true if the number was successfully converted, false otherwise.
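+/// For example (a sketch; the radix is auto-detected from the "0x" prefix):
+/// \code
+///   uint32_t Value;
+///   bool Ok = to_integer("0x1F", Value); // Ok == true, Value == 31
+/// \endcode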
+template <typename N> bool to_integer(StringRef S, N &Num, unsigned Base = 0) {
+ return !S.getAsInteger(Base, Num);
+}
+
static inline std::string utostr(uint64_t X, bool isNeg = false) {
char Buffer[21];
char *BufPtr = std::end(Buffer);
diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h
index cc4788d3edae..01469a25c96c 100644
--- a/include/llvm/Analysis/CallGraph.h
+++ b/include/llvm/Analysis/CallGraph.h
@@ -41,12 +41,6 @@
/// of all of the caller-callee relationships, which is useful for
/// transformations.
///
-/// The CallGraph class also attempts to figure out what the root of the
-/// CallGraph is, which it currently does by looking for a function named
-/// 'main'. If no function named 'main' is found, the external node is used as
-/// the entry node, reflecting the fact that any function without internal
-/// linkage could be called into (which is common for libraries).
-///
//===----------------------------------------------------------------------===//
#ifndef LLVM_ANALYSIS_CALLGRAPH_H
@@ -82,10 +76,6 @@ class CallGraph {
/// \brief A map from \c Function* to \c CallGraphNode*.
FunctionMapTy FunctionMap;
- /// \brief Root is root of the call graph, or the external node if a 'main'
- /// function couldn't be found.
- CallGraphNode *Root;
-
/// \brief This node has edges to all external functions and those internal
/// functions that have their address taken.
CallGraphNode *ExternalCallingNode;
diff --git a/include/llvm/Analysis/ProfileSummaryInfo.h b/include/llvm/Analysis/ProfileSummaryInfo.h
index 75c4cbd03706..c5f97083af4d 100644
--- a/include/llvm/Analysis/ProfileSummaryInfo.h
+++ b/include/llvm/Analysis/ProfileSummaryInfo.h
@@ -67,8 +67,8 @@ public:
}
/// Returns the profile count for \p CallInst.
- static Optional<uint64_t> getProfileCount(const Instruction *CallInst,
- BlockFrequencyInfo *BFI);
+ Optional<uint64_t> getProfileCount(const Instruction *CallInst,
+ BlockFrequencyInfo *BFI);
/// \brief Returns true if \p F has hot function entry.
bool isFunctionEntryHot(const Function *F);
/// Returns true if \p F has hot function entry or hot call edge.
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 85350fa159d6..ceca6cb389a1 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -568,27 +568,16 @@ private:
Predicates.insert(P);
}
- /*implicit*/ ExitLimit(const SCEV *E)
- : ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {}
+ /*implicit*/ ExitLimit(const SCEV *E);
ExitLimit(
const SCEV *E, const SCEV *M, bool MaxOrZero,
- ArrayRef<const SmallPtrSetImpl<const SCEVPredicate *> *> PredSetList)
- : ExactNotTaken(E), MaxNotTaken(M), MaxOrZero(MaxOrZero) {
- assert((isa<SCEVCouldNotCompute>(ExactNotTaken) ||
- !isa<SCEVCouldNotCompute>(MaxNotTaken)) &&
- "Exact is not allowed to be less precise than Max");
- for (auto *PredSet : PredSetList)
- for (auto *P : *PredSet)
- addPredicate(P);
- }
+ ArrayRef<const SmallPtrSetImpl<const SCEVPredicate *> *> PredSetList);
ExitLimit(const SCEV *E, const SCEV *M, bool MaxOrZero,
- const SmallPtrSetImpl<const SCEVPredicate *> &PredSet)
- : ExitLimit(E, M, MaxOrZero, {&PredSet}) {}
+ const SmallPtrSetImpl<const SCEVPredicate *> &PredSet);
- ExitLimit(const SCEV *E, const SCEV *M, bool MaxOrZero)
- : ExitLimit(E, M, MaxOrZero, None) {}
+ ExitLimit(const SCEV *E, const SCEV *M, bool MaxOrZero);
/// Test whether this ExitLimit contains any computed information, or
/// whether it's all SCEVCouldNotCompute values.
@@ -782,7 +771,7 @@ private:
/// Set the memoized range for the given SCEV.
const ConstantRange &setRange(const SCEV *S, RangeSignHint Hint,
- ConstantRange &&CR) {
+ ConstantRange CR) {
DenseMap<const SCEV *, ConstantRange> &Cache =
Hint == HINT_RANGE_UNSIGNED ? UnsignedRanges : SignedRanges;
diff --git a/include/llvm/Analysis/TargetLibraryInfo.def b/include/llvm/Analysis/TargetLibraryInfo.def
index 099a3c7cf2ac..9cbe917c146d 100644
--- a/include/llvm/Analysis/TargetLibraryInfo.def
+++ b/include/llvm/Analysis/TargetLibraryInfo.def
@@ -161,6 +161,60 @@ TLI_DEFINE_STRING_INTERNAL("_Znwm")
/// void *new(unsigned long, nothrow);
TLI_DEFINE_ENUM_INTERNAL(ZnwmRKSt9nothrow_t)
TLI_DEFINE_STRING_INTERNAL("_ZnwmRKSt9nothrow_t")
+/// double __acos_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(acos_finite)
+TLI_DEFINE_STRING_INTERNAL("__acos_finite")
+/// float __acosf_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(acosf_finite)
+TLI_DEFINE_STRING_INTERNAL("__acosf_finite")
+/// double __acosh_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(acosh_finite)
+TLI_DEFINE_STRING_INTERNAL("__acosh_finite")
+/// float __acoshf_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(acoshf_finite)
+TLI_DEFINE_STRING_INTERNAL("__acoshf_finite")
+/// long double __acoshl_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(acoshl_finite)
+TLI_DEFINE_STRING_INTERNAL("__acoshl_finite")
+/// long double __acosl_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(acosl_finite)
+TLI_DEFINE_STRING_INTERNAL("__acosl_finite")
+/// double __asin_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(asin_finite)
+TLI_DEFINE_STRING_INTERNAL("__asin_finite")
+/// float __asinf_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(asinf_finite)
+TLI_DEFINE_STRING_INTERNAL("__asinf_finite")
+/// long double __asinl_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(asinl_finite)
+TLI_DEFINE_STRING_INTERNAL("__asinl_finite")
+/// double __atan2_finite(double y, double x);
+TLI_DEFINE_ENUM_INTERNAL(atan2_finite)
+TLI_DEFINE_STRING_INTERNAL("__atan2_finite")
+/// float __atan2f_finite(float y, float x);
+TLI_DEFINE_ENUM_INTERNAL(atan2f_finite)
+TLI_DEFINE_STRING_INTERNAL("__atan2f_finite")
+/// long double __atan2l_finite(long double y, long double x);
+TLI_DEFINE_ENUM_INTERNAL(atan2l_finite)
+TLI_DEFINE_STRING_INTERNAL("__atan2l_finite")
+/// double __atanh_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(atanh_finite)
+TLI_DEFINE_STRING_INTERNAL("__atanh_finite")
+/// float __atanhf_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(atanhf_finite)
+TLI_DEFINE_STRING_INTERNAL("__atanhf_finite")
+/// long double __atanhl_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(atanhl_finite)
+TLI_DEFINE_STRING_INTERNAL("__atanhl_finite")
+/// double __cosh_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(cosh_finite)
+TLI_DEFINE_STRING_INTERNAL("__cosh_finite")
+/// float __coshf_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(coshf_finite)
+TLI_DEFINE_STRING_INTERNAL("__coshf_finite")
+/// long double __coshl_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(coshl_finite)
+TLI_DEFINE_STRING_INTERNAL("__coshl_finite")
/// double __cospi(double x);
TLI_DEFINE_ENUM_INTERNAL(cospi)
TLI_DEFINE_STRING_INTERNAL("__cospi")
@@ -180,12 +234,66 @@ TLI_DEFINE_STRING_INTERNAL("__cxa_guard_acquire")
/// void __cxa_guard_release(guard_t *guard);
TLI_DEFINE_ENUM_INTERNAL(cxa_guard_release)
TLI_DEFINE_STRING_INTERNAL("__cxa_guard_release")
+/// double __exp10_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(exp10_finite)
+TLI_DEFINE_STRING_INTERNAL("__exp10_finite")
+/// float __exp10f_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(exp10f_finite)
+TLI_DEFINE_STRING_INTERNAL("__exp10f_finite")
+/// long double __exp10l_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(exp10l_finite)
+TLI_DEFINE_STRING_INTERNAL("__exp10l_finite")
+/// double __exp2_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(exp2_finite)
+TLI_DEFINE_STRING_INTERNAL("__exp2_finite")
+/// float __exp2f_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(exp2f_finite)
+TLI_DEFINE_STRING_INTERNAL("__exp2f_finite")
+/// long double __exp2l_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(exp2l_finite)
+TLI_DEFINE_STRING_INTERNAL("__exp2l_finite")
+/// double __exp_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(exp_finite)
+TLI_DEFINE_STRING_INTERNAL("__exp_finite")
+/// float __expf_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(expf_finite)
+TLI_DEFINE_STRING_INTERNAL("__expf_finite")
+/// long double __expl_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(expl_finite)
+TLI_DEFINE_STRING_INTERNAL("__expl_finite")
/// int __isoc99_scanf (const char *format, ...)
TLI_DEFINE_ENUM_INTERNAL(dunder_isoc99_scanf)
TLI_DEFINE_STRING_INTERNAL("__isoc99_scanf")
/// int __isoc99_sscanf(const char *s, const char *format, ...)
TLI_DEFINE_ENUM_INTERNAL(dunder_isoc99_sscanf)
TLI_DEFINE_STRING_INTERNAL("__isoc99_sscanf")
+/// double __log10_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(log10_finite)
+TLI_DEFINE_STRING_INTERNAL("__log10_finite")
+/// float __log10f_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(log10f_finite)
+TLI_DEFINE_STRING_INTERNAL("__log10f_finite")
+/// long double __log10l_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(log10l_finite)
+TLI_DEFINE_STRING_INTERNAL("__log10l_finite")
+/// double __log2_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(log2_finite)
+TLI_DEFINE_STRING_INTERNAL("__log2_finite")
+/// float __log2f_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(log2f_finite)
+TLI_DEFINE_STRING_INTERNAL("__log2f_finite")
+/// long double __log2l_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(log2l_finite)
+TLI_DEFINE_STRING_INTERNAL("__log2l_finite")
+/// double __log_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(log_finite)
+TLI_DEFINE_STRING_INTERNAL("__log_finite")
+/// float __logf_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(logf_finite)
+TLI_DEFINE_STRING_INTERNAL("__logf_finite")
+/// long double __logl_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(logl_finite)
+TLI_DEFINE_STRING_INTERNAL("__logl_finite")
/// void *__memcpy_chk(void *s1, const void *s2, size_t n, size_t s1size);
TLI_DEFINE_ENUM_INTERNAL(memcpy_chk)
TLI_DEFINE_STRING_INTERNAL("__memcpy_chk")
@@ -199,13 +307,30 @@ TLI_DEFINE_STRING_INTERNAL("__memset_chk")
// int __nvvm_reflect(const char *)
TLI_DEFINE_ENUM_INTERNAL(nvvm_reflect)
TLI_DEFINE_STRING_INTERNAL("__nvvm_reflect")
-
+/// double __pow_finite(double x, double y);
+TLI_DEFINE_ENUM_INTERNAL(pow_finite)
+TLI_DEFINE_STRING_INTERNAL("__pow_finite")
+/// float __powf_finite(float x, float y);
+TLI_DEFINE_ENUM_INTERNAL(powf_finite)
+TLI_DEFINE_STRING_INTERNAL("__powf_finite")
+/// long double __powl_finite(long double x, long double y);
+TLI_DEFINE_ENUM_INTERNAL(powl_finite)
+TLI_DEFINE_STRING_INTERNAL("__powl_finite")
/// double __sincospi_stret(double x);
TLI_DEFINE_ENUM_INTERNAL(sincospi_stret)
TLI_DEFINE_STRING_INTERNAL("__sincospi_stret")
/// float __sincospif_stret(float x);
TLI_DEFINE_ENUM_INTERNAL(sincospif_stret)
TLI_DEFINE_STRING_INTERNAL("__sincospif_stret")
+/// double __sinh_finite(double x);
+TLI_DEFINE_ENUM_INTERNAL(sinh_finite)
+TLI_DEFINE_STRING_INTERNAL("__sinh_finite")
+/// float __sinhf_finite(float x);
+TLI_DEFINE_ENUM_INTERNAL(sinhf_finite)
+TLI_DEFINE_STRING_INTERNAL("__sinhf_finite")
+/// long double __sinhl_finite(long double x);
+TLI_DEFINE_ENUM_INTERNAL(sinhl_finite)
+TLI_DEFINE_STRING_INTERNAL("__sinhl_finite")
/// double __sinpi(double x);
TLI_DEFINE_ENUM_INTERNAL(sinpi)
TLI_DEFINE_STRING_INTERNAL("__sinpi")
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index b9639dba1881..0a0af384c3e6 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -537,6 +537,9 @@ public:
/// \return The width of the largest scalar or vector register type.
unsigned getRegisterBitWidth(bool Vector) const;
+ /// \return The width of the smallest vector register type.
+ unsigned getMinVectorRegisterBitWidth() const;
+
/// \return True if it should be considered for address type promotion.
/// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
/// profitable without finding other extensions fed by the same input.
@@ -740,6 +743,22 @@ public:
unsigned ChainSizeInBytes,
VectorType *VecTy) const;
+ /// Flags describing the kind of vector reduction.
+ struct ReductionFlags {
+ ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {}
+ bool IsMaxOp; ///< If the op is a min/max kind, true if it's a max operation.
+ bool IsSigned; ///< Whether the operation is a signed int reduction.
+ bool NoNaN; ///< If op is an fp min/max, whether NaNs may be present.
+ };
+
+ /// \returns True if the target wants to handle the given reduction idiom in
+ /// the intrinsic form instead of the shuffle form.
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ ReductionFlags Flags) const;
+
+ /// \returns True if the target wants to expand the given reduction intrinsic
+ /// into a shuffle sequence.
+ bool shouldExpandReduction(const IntrinsicInst *II) const;
/// @}
private:
@@ -824,6 +843,7 @@ public:
Type *Ty) = 0;
virtual unsigned getNumberOfRegisters(bool Vector) = 0;
virtual unsigned getRegisterBitWidth(bool Vector) = 0;
+ virtual unsigned getMinVectorRegisterBitWidth() = 0;
virtual bool shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
virtual unsigned getCacheLineSize() = 0;
@@ -895,6 +915,9 @@ public:
virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const = 0;
+ virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ ReductionFlags) const = 0;
+ virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
};
template <typename T>
@@ -1057,6 +1080,9 @@ public:
unsigned getRegisterBitWidth(bool Vector) override {
return Impl.getRegisterBitWidth(Vector);
}
+ unsigned getMinVectorRegisterBitWidth() override {
+ return Impl.getMinVectorRegisterBitWidth();
+ }
bool shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
return Impl.shouldConsiderAddressTypePromotion(
@@ -1200,6 +1226,13 @@ public:
VectorType *VecTy) const override {
return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ ReductionFlags Flags) const override {
+ return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
+ }
+ bool shouldExpandReduction(const IntrinsicInst *II) const override {
+ return Impl.shouldExpandReduction(II);
+ }
};
template <typename T>
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index d7fda9e14b05..550e84ad90c4 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -311,6 +311,8 @@ public:
unsigned getRegisterBitWidth(bool Vector) { return 32; }
+ unsigned getMinVectorRegisterBitWidth() { return 128; }
+
bool
shouldConsiderAddressTypePromotion(const Instruction &I,
bool &AllowPromotionWithoutCommonHeader) {
@@ -456,6 +458,16 @@ public:
VectorType *VecTy) const {
return VF;
}
+
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ return false;
+ }
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const {
+ return true;
+ }
+
protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h
index a54c39e3ea3a..f5f323c6b797 100644
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h
@@ -56,6 +56,11 @@ template <typename T> class ArrayRef;
const Instruction *CxtI = nullptr,
const DominatorTree *DT = nullptr,
OptimizationRemarkEmitter *ORE = nullptr);
+ /// Returns the known bits rather than passing by reference.
+ KnownBits computeKnownBits(const Value *V, const DataLayout &DL,
+ unsigned Depth = 0, AssumptionCache *AC = nullptr,
+ const Instruction *CxtI = nullptr,
+ const DominatorTree *DT = nullptr);
/// Compute known bits from the range metadata.
/// \p KnownZero the set of bits that are known to be zero
/// \p KnownOne the set of bits that are known to be one
@@ -68,14 +73,6 @@ template <typename T> class ArrayRef;
const Instruction *CxtI = nullptr,
const DominatorTree *DT = nullptr);
- /// Determine whether the sign bit is known to be zero or one. Convenience
- /// wrapper around computeKnownBits.
- void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- const DataLayout &DL, unsigned Depth = 0,
- AssumptionCache *AC = nullptr,
- const Instruction *CxtI = nullptr,
- const DominatorTree *DT = nullptr);
-
/// Return true if the given value is known to have exactly one bit set when
/// defined. For vectors return true if every element is known to be a power
/// of two when defined. Supports values with integer or pointer type and
diff --git a/include/llvm/Bitcode/BitcodeReader.h b/include/llvm/Bitcode/BitcodeReader.h
index 54f990d00233..31ffb7645f3a 100644
--- a/include/llvm/Bitcode/BitcodeReader.h
+++ b/include/llvm/Bitcode/BitcodeReader.h
@@ -152,10 +152,11 @@ namespace llvm {
/// Parse the module summary index out of an IR file and return the module
/// summary index object if found, or an empty summary if not. If Path refers
- /// to an empty file and the -ignore-empty-index-file cl::opt flag is passed
+ /// to an empty file and IgnoreEmptyThinLTOIndexFile is true, then
/// this function will return nullptr.
Expected<std::unique_ptr<ModuleSummaryIndex>>
- getModuleSummaryIndexForFile(StringRef Path);
+ getModuleSummaryIndexForFile(StringRef Path,
+ bool IgnoreEmptyThinLTOIndexFile = false);
/// isBitcodeWrapper - Return true if the given bytes are the magic bytes
/// for an LLVM IR bitcode wrapper.
diff --git a/include/llvm/CodeGen/ExpandReductions.h b/include/llvm/CodeGen/ExpandReductions.h
new file mode 100644
index 000000000000..c6aaaad967b3
--- /dev/null
+++ b/include/llvm/CodeGen/ExpandReductions.h
@@ -0,0 +1,24 @@
+//===----- ExpandReductions.h - Expand experimental reduction intrinsics --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_EXPANDREDUCTIONS_H
+#define LLVM_CODEGEN_EXPANDREDUCTIONS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class ExpandReductionsPass
+ : public PassInfoMixin<ExpandReductionsPass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_EXPANDREDUCTIONS_H
diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 30d67eb49923..21354ae20ed1 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -145,7 +145,7 @@ public:
/// Iterate the given function (typically something like doubling the width)
/// on Ty until we find a legal type for this operation.
- LLT findLegalType(const InstrAspect &Aspect,
+ Optional<LLT> findLegalType(const InstrAspect &Aspect,
function_ref<LLT(LLT)> NextType) const {
LegalizeAction Action;
const TypeMap &Map = Actions[Aspect.Opcode - FirstOp][Aspect.Idx];
@@ -153,8 +153,12 @@ public:
do {
Ty = NextType(Ty);
auto ActionIt = Map.find(Ty);
- if (ActionIt == Map.end())
- Action = DefaultActions.find(Aspect.Opcode)->second;
+ if (ActionIt == Map.end()) {
+ auto DefaultIt = DefaultActions.find(Aspect.Opcode);
+ if (DefaultIt == DefaultActions.end())
+ return None;
+ Action = DefaultIt->second;
+ }
else
Action = ActionIt->second;
} while(Action != Legal);
@@ -163,11 +167,14 @@ public:
/// Find what type it's actually OK to perform the given operation on, given
/// the general approach we've decided to take.
- LLT findLegalType(const InstrAspect &Aspect, LegalizeAction Action) const;
+ Optional<LLT> findLegalType(const InstrAspect &Aspect, LegalizeAction Action) const;
std::pair<LegalizeAction, LLT> findLegalAction(const InstrAspect &Aspect,
LegalizeAction Action) const {
- return std::make_pair(Action, findLegalType(Aspect, Action));
+ auto LegalType = findLegalType(Aspect, Action);
+ if (!LegalType)
+ return std::make_pair(LegalizeAction::Unsupported, LLT());
+ return std::make_pair(Action, *LegalType);
}
/// Find the specified \p Aspect in the primary (explicitly set) Actions
diff --git a/include/llvm/CodeGen/GlobalISel/Utils.h b/include/llvm/CodeGen/GlobalISel/Utils.h
index 92bc9736141a..69d507069808 100644
--- a/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -30,6 +30,7 @@ class TargetInstrInfo;
class TargetPassConfig;
class TargetRegisterInfo;
class Twine;
+class ConstantFP;
/// Try to constrain Reg so that it is usable by argument OpIdx of the
/// provided MCInstrDesc \p II. If this fails, create a new virtual
@@ -62,6 +63,8 @@ void reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
Optional<int64_t> getConstantVRegVal(unsigned VReg,
const MachineRegisterInfo &MRI);
+const ConstantFP* getConstantFPVRegVal(unsigned VReg,
+ const MachineRegisterInfo &MRI);
} // End namespace llvm.
#endif
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index ca0f3fbad892..f2a9a9f73ca6 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -644,6 +644,13 @@ namespace ISD {
/// of a call sequence, and carry arbitrary information that target might
/// want to know. The first operand is a chain, the rest are specified by
/// the target and not touched by the DAG optimizers.
+ /// Targets that may use the stack to pass call arguments define additional
+ /// operands:
+ /// - size of the call frame part that must be set up within the
+ /// CALLSEQ_START..CALLSEQ_END pair,
+ /// - part of the call frame prepared prior to CALLSEQ_START.
+ /// Both these parameters must be constants, their sum is the total call
+ /// frame size.
/// CALLSEQ_START..CALLSEQ_END pairs may not be nested.
CALLSEQ_START, // Beginning of a call sequence
CALLSEQ_END, // End of a call sequence
@@ -783,6 +790,20 @@ namespace ISD {
/// known nonzero constant. The only operand here is the chain.
GET_DYNAMIC_AREA_OFFSET,
+ /// Generic reduction nodes. These nodes represent horizontal vector
+ /// reduction operations, producing a scalar result.
+ /// The STRICT variants perform reductions in sequential order. The first
+ /// operand is an initial scalar accumulator value, and the second operand
+ /// is the vector to reduce.
+ VECREDUCE_STRICT_FADD, VECREDUCE_STRICT_FMUL,
+ /// These reductions are non-strict, and have a single vector operand.
+ VECREDUCE_FADD, VECREDUCE_FMUL,
+ VECREDUCE_ADD, VECREDUCE_MUL,
+ VECREDUCE_AND, VECREDUCE_OR, VECREDUCE_XOR,
+ VECREDUCE_SMAX, VECREDUCE_SMIN, VECREDUCE_UMAX, VECREDUCE_UMIN,
+ /// FMIN/FMAX nodes can have fast-math flags, distinguishing the NaN and
+ /// NoNaN variants.
+ VECREDUCE_FMAX, VECREDUCE_FMIN,
+
/// BUILTIN_OP_END - This must be the last enum value in this list.
/// The target-specific pre-isel opcode values start here.
BUILTIN_OP_END
diff --git a/include/llvm/CodeGen/MachineCombinerPattern.h b/include/llvm/CodeGen/MachineCombinerPattern.h
index 11238016d447..8c54ae925470 100644
--- a/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -48,6 +48,8 @@ enum class MachineCombinerPattern {
FMULADDD_OP2,
FMULSUBD_OP1,
FMULSUBD_OP2,
+ FNMULSUBS_OP1,
+ FNMULSUBD_OP1,
FMLAv1i32_indexed_OP1,
FMLAv1i32_indexed_OP2,
FMLAv1i64_indexed_OP1,
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 42299b529410..8a5a1997386f 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -68,6 +68,10 @@ namespace llvm {
/// matching during instruction selection.
FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = nullptr);
+ /// createScalarizeMaskedMemIntrinPass - Replace masked load, store, gather
+ /// and scatter intrinsics with scalar code when the target doesn't support
+ /// them.
+ FunctionPass *createScalarizeMaskedMemIntrinPass();
+
/// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg
/// load-linked/store-conditional loops.
extern char &AtomicExpandID;
@@ -129,6 +133,10 @@ namespace llvm {
// instruction and update the MachineFunctionInfo with that information.
extern char &ShrinkWrapID;
+ /// LiveRangeShrink pass. Moves an instruction closer to its definition to
+ /// shrink the definition's live range.
+ extern char &LiveRangeShrinkID;
+
/// Greedy register allocator.
extern char &RAGreedyID;
@@ -405,6 +413,10 @@ namespace llvm {
/// printing assembly.
ModulePass *createMachineOutlinerPass();
+ /// This pass expands the experimental reduction intrinsics into sequences of
+ /// shuffles.
+ FunctionPass *createExpandReductionsPass();
+
} // End llvm namespace
/// Target machine pass initializer for passes with dependencies. Use with
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 9e1d148c7ce5..d761661f763e 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -406,7 +406,7 @@ public:
/// certain types of nodes together, or eliminating superfluous nodes. The
/// Level argument controls whether Combine is allowed to produce nodes and
/// types that are illegal on the target.
- void Combine(CombineLevel Level, AliasAnalysis &AA,
+ void Combine(CombineLevel Level, AliasAnalysis *AA,
CodeGenOpt::Level OptLevel);
/// This transforms the SelectionDAG into a SelectionDAG that
@@ -737,11 +737,15 @@ public:
/// \brief Create a logical NOT operation as (XOR Val, BooleanOne).
SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT);
- /// Return a new CALLSEQ_START node, which always must have a glue result
- /// (to ensure it's not CSE'd). CALLSEQ_START does not have a useful SDLoc.
- SDValue getCALLSEQ_START(SDValue Chain, SDValue Op, const SDLoc &DL) {
+ /// Return a new CALLSEQ_START node, which starts a new call frame, in which
+ /// InSize bytes are set up inside the CALLSEQ_START..CALLSEQ_END sequence and
+ /// OutSize specifies the part of the frame set up prior to the sequence.
+ SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize,
+ const SDLoc &DL) {
SDVTList VTs = getVTList(MVT::Other, MVT::Glue);
- SDValue Ops[] = { Chain, Op };
+ SDValue Ops[] = { Chain,
+ getIntPtrConstant(InSize, DL, true),
+ getIntPtrConstant(OutSize, DL, true) };
return getNode(ISD::CALLSEQ_START, DL, VTs, Ops);
}
diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
index e9012db7602d..f3122f0bf7f0 100644
--- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
@@ -26,6 +26,7 @@ public:
void addTypeServerHandler(TypeServerHandler &Handler);
+ Error visitTypeRecord(CVType &Record, TypeIndex Index);
Error visitTypeRecord(CVType &Record);
Error visitMemberRecord(CVMemberRecord &Record);
@@ -37,6 +38,9 @@ public:
Error visitFieldListMemberStream(BinaryStreamReader Reader);
private:
+ Expected<bool> handleTypeServer(CVType &Record);
+ Error finishVisitation(CVType &Record);
+
/// The interface to the class that gets notified of each visitation.
TypeVisitorCallbacks &Callbacks;
diff --git a/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h b/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h
new file mode 100644
index 000000000000..35a8010f1163
--- /dev/null
+++ b/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h
@@ -0,0 +1,103 @@
+//===- RandomAccessTypeVisitor.h ------------------------------ *- C++ --*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H
+#define LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H
+
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace codeview {
+
+class TypeDatabase;
+class TypeServerHandler;
+class TypeVisitorCallbacks;
+
+/// \brief Provides amortized O(1) random access to a CodeView type stream.
+/// Normally to access a type from a type stream, you must know its byte
+/// offset into the type stream, because type records are variable-length.
+/// However, this is not the way we prefer to access them. For example, given
+/// a symbol record, one of the fields may be the TypeIndex of the symbol's
+/// type record. Or, given a type record such as an array type, there might
+/// be a TypeIndex for the element type. Sequential access is perfect when
+/// we're just dumping every entry, but it's very poor for real-world usage.
+///
+/// Type streams in PDBs contain an additional field which is a list of pairs
+/// containing indices and their corresponding offsets, roughly every 8KB of
+/// record data. This general idea need not be confined to PDBs, though. By
+/// supplying such an array, the producer of a type stream can allow the
+/// consumer much better access time, because the consumer can find the nearest
+/// index in this array, and do a linear scan forward only from there.
+///
+/// RandomAccessTypeVisitor implements this algorithm, but additionally goes one
+/// step further by caching offsets of every record that has been visited at
+/// least once. This way, even repeated visits of the same record will never
+/// require more than one linear scan. For a type stream of N elements divided
+/// into M chunks of roughly equal size, this yields a worst case lookup time
+/// of O(N/M) and an amortized time of O(1).
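+///
+/// A sketch of intended usage (the variable names here are illustrative):
+/// \code
+///   RandomAccessTypeVisitor Visitor(Types, NumRecords, PartialOffsets);
+///   if (auto EC = Visitor.visitTypeIndex(Index, Callbacks))
+///     consumeError(std::move(EC));
+/// \endcode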
+class RandomAccessTypeVisitor {
+ typedef FixedStreamArray<TypeIndexOffset> PartialOffsetArray;
+
+public:
+ RandomAccessTypeVisitor(const CVTypeArray &Types, uint32_t NumRecords,
+ PartialOffsetArray PartialOffsets);
+
+ Error visitTypeIndex(TypeIndex Index, TypeVisitorCallbacks &Callbacks);
+
+ const TypeDatabase &database() const { return Database; }
+
+private:
+ Error visitRangeForType(TypeIndex TI);
+ Error visitRange(TypeIndex Begin, uint32_t BeginOffset, TypeIndex End);
+
+ /// Visited records get automatically added to the type database.
+ TypeDatabase Database;
+
+ /// The type array to allow random access visitation of.
+ const CVTypeArray &Types;
+
+ /// The database visitor which adds new records to the database.
+ TypeDatabaseVisitor DatabaseVisitor;
+
+ /// The deserializer which deserializes new records.
+ TypeDeserializer Deserializer;
+
+ /// The visitation callback pipeline to use. By default this contains a
+ /// deserializer and a type database visitor, with the callbacks supplied to
+ /// visitTypeIndex layered on top of those.
+ TypeVisitorCallbackPipeline Pipeline;
+
+ /// The visitor used to visit the internal pipeline for deserialization and
+ /// database maintenance.
+ CVTypeVisitor InternalVisitor;
+
+ /// A vector mapping type indices to type offsets. For every record that has
+ /// been visited, contains the absolute offset of that record in the record
+ /// array.
+ std::vector<uint32_t> KnownOffsets;
+
+ /// An array of index offsets for the given type stream, allowing log(N)
+ /// lookups of a type record by index. Similar to KnownOffsets but only
+ /// contains offsets for some type indices, some of which may not have
+ /// ever been visited.
+ PartialOffsetArray PartialOffsets;
+};
+
+} // end namespace codeview
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H
diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabase.h b/include/llvm/DebugInfo/CodeView/TypeDatabase.h
index be7b19e7df0c..92c15ebd8b2b 100644
--- a/include/llvm/DebugInfo/CodeView/TypeDatabase.h
+++ b/include/llvm/DebugInfo/CodeView/TypeDatabase.h
@@ -10,6 +10,7 @@
#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASE_H
#define LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASE_H
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
@@ -20,14 +21,16 @@
namespace llvm {
namespace codeview {
class TypeDatabase {
+ friend class RandomAccessTypeVisitor;
+
public:
- explicit TypeDatabase(uint32_t ExpectedSize);
+ explicit TypeDatabase(uint32_t Capacity);
- /// Gets the type index for the next type record.
- TypeIndex getNextTypeIndex() const;
+ /// Appends a type record and its name, returning the new type index.
+ TypeIndex appendType(StringRef Name, const CVType &Data);
/// Records the name of a type, and reserves its type index.
- void recordType(StringRef Name, const CVType &Data);
+ void recordType(StringRef Name, TypeIndex Index, const CVType &Data);
/// Saves the name in a StringSet and creates a stable StringRef.
StringRef saveTypeName(StringRef TypeName);
@@ -37,13 +40,21 @@ public:
const CVType &getTypeRecord(TypeIndex Index) const;
CVType &getTypeRecord(TypeIndex Index);
- bool containsTypeIndex(TypeIndex Index) const;
+ bool contains(TypeIndex Index) const;
uint32_t size() const;
+ uint32_t capacity() const;
+ bool empty() const;
+
+ TypeIndex getAppendIndex() const;
private:
+ void grow();
+
BumpPtrAllocator Allocator;
+ uint32_t Count = 0;
+
/// All user defined type records in .debug$T live in here. Type indices
/// greater than 0x1000 are user defined. Subtract 0x1000 from the index to
/// index into this vector.
@@ -51,6 +62,8 @@ private:
SmallVector<CVType, 10> TypeRecords;
StringSaver TypeNameStorage;
+
+ BitVector ValidRecords;
};
}
}
diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h
index 39d234cf9814..c064e19a7e90 100644
--- a/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h
@@ -10,6 +10,8 @@
#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASEVISITOR_H
#define LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASEVISITOR_H
+#include "llvm/ADT/PointerUnion.h"
+
#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
@@ -21,11 +23,12 @@ namespace codeview {
/// Dumper for CodeView type streams found in COFF object files and PDB files.
class TypeDatabaseVisitor : public TypeVisitorCallbacks {
public:
- explicit TypeDatabaseVisitor(TypeDatabase &TypeDB) : TypeDB(TypeDB) {}
+ explicit TypeDatabaseVisitor(TypeDatabase &TypeDB) : TypeDB(&TypeDB) {}
/// Paired begin/end actions for all types. Receives all record data,
/// including the fixed-length record prefix.
Error visitTypeBegin(CVType &Record) override;
+ Error visitTypeBegin(CVType &Record, TypeIndex Index) override;
Error visitTypeEnd(CVType &Record) override;
Error visitMemberBegin(CVMemberRecord &Record) override;
Error visitMemberEnd(CVMemberRecord &Record) override;
@@ -39,12 +42,18 @@ public:
#include "TypeRecords.def"
private:
+ StringRef getTypeName(TypeIndex Index) const;
+ StringRef saveTypeName(StringRef Name);
+
bool IsInFieldList = false;
/// Name of the current type. Only valid before visitTypeEnd.
StringRef Name;
+ /// Current type index. Only valid before visitTypeEnd, and if we are
+ /// visiting a random access type database.
+ Optional<TypeIndex> CurrentTypeIndex;
- TypeDatabase &TypeDB;
+ TypeDatabase *TypeDB;
};
} // end namespace codeview
diff --git a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h
index 0e3443789170..2142d4a2dec7 100644
--- a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h
+++ b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h
@@ -46,6 +46,10 @@ public:
return Mapping->Mapping.visitTypeBegin(Record);
}
+ Error visitTypeBegin(CVType &Record, TypeIndex Index) override {
+ return visitTypeBegin(Record);
+ }
+
Error visitTypeEnd(CVType &Record) override {
assert(Mapping && "Not in a type mapping!");
auto EC = Mapping->Mapping.visitTypeEnd(Record);
diff --git a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h
index 00bb09137e48..6f10afb30d60 100644
--- a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h
@@ -45,6 +45,7 @@ public:
/// Paired begin/end actions for all types. Receives all record data,
/// including the fixed-length record prefix.
Error visitTypeBegin(CVType &Record) override;
+ Error visitTypeBegin(CVType &Record, TypeIndex Index) override;
Error visitTypeEnd(CVType &Record) override;
Error visitMemberBegin(CVMemberRecord &Record) override;
Error visitMemberEnd(CVMemberRecord &Record) override;
diff --git a/include/llvm/DebugInfo/CodeView/TypeIndex.h b/include/llvm/DebugInfo/CodeView/TypeIndex.h
index 3c11d248fa72..b5d695fc49d5 100644
--- a/include/llvm/DebugInfo/CodeView/TypeIndex.h
+++ b/include/llvm/DebugInfo/CodeView/TypeIndex.h
@@ -106,6 +106,15 @@ public:
bool isNoneType() const { return *this == None(); }
+ uint32_t toArrayIndex() const {
+ assert(!isSimple());
+ return getIndex() - FirstNonSimpleIndex;
+ }
+
+ static TypeIndex fromArrayIndex(uint32_t Index) {
+ return TypeIndex(Index + FirstNonSimpleIndex);
+ }
+
SimpleTypeKind getSimpleKind() const {
assert(isSimple());
return static_cast<SimpleTypeKind>(Index & SimpleKindMask);
@@ -159,6 +168,39 @@ public:
static TypeIndex Float32() { return TypeIndex(SimpleTypeKind::Float32); }
static TypeIndex Float64() { return TypeIndex(SimpleTypeKind::Float64); }
+ TypeIndex &operator+=(unsigned N) {
+ Index += N;
+ return *this;
+ }
+
+ TypeIndex &operator++() {
+ Index += 1;
+ return *this;
+ }
+
+ TypeIndex operator++(int) {
+ TypeIndex Copy = *this;
+ operator++();
+ return Copy;
+ }
+
+ TypeIndex &operator-=(unsigned N) {
+ assert(Index >= N);
+ Index -= N;
+ return *this;
+ }
+
+ TypeIndex &operator--() {
+ Index -= 1;
+ return *this;
+ }
+
+ TypeIndex operator--(int) {
+ TypeIndex Copy = *this;
+ operator--();
+ return Copy;
+ }
+
friend inline bool operator==(const TypeIndex &A, const TypeIndex &B) {
return A.getIndex() == B.getIndex();
}
@@ -183,10 +225,30 @@ public:
return A.getIndex() >= B.getIndex();
}
+ friend inline TypeIndex operator+(const TypeIndex &A, uint32_t N) {
+ TypeIndex Result(A);
+ Result += N;
+ return Result;
+ }
+
+ friend inline TypeIndex operator-(const TypeIndex &A, uint32_t N) {
+ assert(A.getIndex() >= N);
+ TypeIndex Result(A);
+ Result -= N;
+ return Result;
+ }
+
private:
support::ulittle32_t Index;
};
+// Used for pseudo-indexing an array of type records. An array of such records
+// sorted by TypeIndex can allow log(N) lookups even though such a type record
+// stream does not provide random access.
+struct TypeIndexOffset {
+ TypeIndex Type;
+ support::ulittle32_t Offset;
+};
}
}
diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h
index f25129691041..ed48df33249f 100644
--- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h
+++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h
@@ -47,6 +47,14 @@ public:
return Error::success();
}
+ Error visitTypeBegin(CVType &Record, TypeIndex Index) override {
+ for (auto Visitor : Pipeline) {
+ if (auto EC = Visitor->visitTypeBegin(Record, Index))
+ return EC;
+ }
+ return Error::success();
+ }
+
Error visitTypeEnd(CVType &Record) override {
for (auto Visitor : Pipeline) {
if (auto EC = Visitor->visitTypeEnd(Record))
diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h
index 5e27df346b00..2950c7d27cb6 100644
--- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h
+++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h
@@ -26,8 +26,15 @@ public:
virtual Error visitUnknownType(CVType &Record) { return Error::success(); }
/// Paired begin/end actions for all types. Receives all record data,
/// including the fixed-length record prefix. visitTypeBegin() should return
- /// the type of the Record, or an error if it cannot be determined.
+ /// the type of the Record, or an error if it cannot be determined. Exactly
+ /// one of the two visitTypeBegin methods will be called, depending on whether
+ /// records are being visited sequentially or randomly. An implementation
+ /// should be prepared to handle both (or assert if it can't handle random
+ /// access visitation).
virtual Error visitTypeBegin(CVType &Record) { return Error::success(); }
+ virtual Error visitTypeBegin(CVType &Record, TypeIndex Index) {
+ return Error::success();
+ }
virtual Error visitTypeEnd(CVType &Record) { return Error::success(); }
virtual Error visitUnknownMember(CVMemberRecord &Record) {
diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h
index 3fae8b441439..ca82a68ead31 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -43,13 +43,6 @@ namespace llvm {
class MemoryBuffer;
class raw_ostream;
-// In place of applying the relocations to the data we've read from disk we use
-// a separate mapping table to the side and checking that at locations in the
-// dwarf where we expect relocated values. This adds a bit of complexity to the
-// dwarf parsing/extraction at the benefit of not allocating memory for the
-// entire size of the debug info sections.
-typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t>> RelocAddrMap;
-
/// Reads a value from data extractor and applies a relocation to the result if
/// one exists for the given offset.
uint64_t getRelocatedValue(const DataExtractor &Data, uint32_t Size,
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
index e21245b97b73..39a7ef71de97 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
@@ -30,7 +30,7 @@ public:
struct FileNameEntry {
FileNameEntry() = default;
- StringRef Name = StringRef();
+ StringRef Name;
uint64_t DirIdx = 0;
uint64_t ModTime = 0;
uint64_t Length = 0;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
index 9172df5bfac6..23a573b7a9fa 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
@@ -22,8 +22,13 @@ namespace llvm {
class raw_ostream;
+struct DWARFAddressRange {
+ uint64_t LowPC;
+ uint64_t HighPC;
+};
+
/// DWARFAddressRangesVector - represents a set of absolute address ranges.
-typedef std::vector<std::pair<uint64_t, uint64_t>> DWARFAddressRangesVector;
+typedef std::vector<DWARFAddressRange> DWARFAddressRangesVector;
class DWARFDebugRangeList {
public:
diff --git a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
index af01bddeed15..f1e03bb4c2e1 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
@@ -16,7 +16,17 @@
namespace llvm {
-typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t>> RelocAddrMap;
+struct RelocAddrEntry {
+ uint8_t Width;
+ int64_t Value;
+};
+
+// In place of applying the relocations to the data we've read from disk, we
+// use a separate mapping table on the side and check it at the locations in
+// the DWARF where we expect relocated values. This adds a bit of complexity
+// to the DWARF parsing/extraction, but has the benefit of not allocating
+// memory for the entire size of the debug info sections.
+typedef DenseMap<uint64_t, RelocAddrEntry> RelocAddrMap;
} // end namespace llvm
diff --git a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
index 8e12bcd2c8e2..b9f14be85926 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
@@ -40,7 +40,7 @@ class DWARFVerifier {
///
/// @param Die The DWARF DIE that owns the attribute value
/// @param AttrValue The DWARF attribute value to check
- void verifyDebugInfoAttribute(DWARFDie &Die, DWARFAttribute &AttrValue);
+ void verifyDebugInfoAttribute(const DWARFDie &Die, DWARFAttribute &AttrValue);
/// Verifies the attribute's DWARF form.
///
@@ -51,7 +51,7 @@ class DWARFVerifier {
///
/// @param Die The DWARF DIE that owns the attribute value
/// @param AttrValue The DWARF attribute value to check
- void verifyDebugInfoForm(DWARFDie &Die, DWARFAttribute &AttrValue);
+ void verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue);
/// Verifies all of the valid references that were found when iterating through
/// all of the DIE attributes.
@@ -60,7 +60,7 @@ class DWARFVerifier {
/// offset matches. This helps to ensure that if a DWARF link phase moved
/// things around, it doesn't create invalid references by failing to relocate
/// CU relative and absolute references.
- void veifyDebugInfoReferences();
+ void verifyDebugInfoReferences();
/// Verify the DW_AT_stmt_list encoding and value and ensure that no two
/// compile units have the same DW_AT_stmt_list value.
diff --git a/include/llvm/DebugInfo/PDB/Native/RawTypes.h b/include/llvm/DebugInfo/PDB/Native/RawTypes.h
index 979b8454dd5e..771272d6a47d 100644
--- a/include/llvm/DebugInfo/PDB/Native/RawTypes.h
+++ b/include/llvm/DebugInfo/PDB/Native/RawTypes.h
@@ -73,13 +73,6 @@ struct SecMapEntry {
support::ulittle32_t SecByteLength; // Byte count of the segment or group.
};
-// Used for serialized hash table in TPI stream.
-// In the reference, it is an array of TI and cbOff pair.
-struct TypeIndexOffset {
- codeview::TypeIndex Type;
- support::ulittle32_t Offset;
-};
-
/// Some of the values are stored in bitfields. Since this needs to be portable
/// across compilers and architectures (big / little endian in particular) we
/// can't use the actual structures below, but must instead do the shifting
diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
index 9fef9bee5e1a..4579cbf4227b 100644
--- a/include/llvm/DebugInfo/PDB/Native/TpiStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
@@ -47,7 +47,7 @@ public:
uint32_t getHashKeySize() const;
uint32_t getNumHashBuckets() const;
FixedStreamArray<support::ulittle32_t> getHashValues() const;
- FixedStreamArray<TypeIndexOffset> getTypeIndexOffsets() const;
+ FixedStreamArray<codeview::TypeIndexOffset> getTypeIndexOffsets() const;
HashTable &getHashAdjusters();
codeview::CVTypeRange types(bool *HadError) const;
@@ -62,7 +62,7 @@ private:
std::unique_ptr<BinaryStream> HashStream;
FixedStreamArray<support::ulittle32_t> HashValues;
- FixedStreamArray<TypeIndexOffset> TypeIndexOffsets;
+ FixedStreamArray<codeview::TypeIndexOffset> TypeIndexOffsets;
HashTable HashAdjusters;
const TpiStreamHeader *Header;
diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
index a29ed0b610d3..6c609c34665c 100644
--- a/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
@@ -75,7 +75,7 @@ private:
Optional<PdbRaw_TpiVer> VerHeader;
std::vector<ArrayRef<uint8_t>> TypeRecords;
std::vector<uint32_t> TypeHashes;
- std::vector<TypeIndexOffset> TypeIndexOffsets;
+ std::vector<codeview::TypeIndexOffset> TypeIndexOffsets;
uint32_t HashStreamIndex = kInvalidStreamIndex;
std::unique_ptr<BinaryByteStream> HashValueStream;
diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
index 7e7f7358938a..1bb911d09cfb 100644
--- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
@@ -172,6 +172,11 @@ private:
return nullptr;
}
+ void removeModulesFromBaseLayer(BaseLayerT &BaseLayer) {
+ for (auto &BLH : BaseLayerHandles)
+ BaseLayer.removeModuleSet(BLH);
+ }
+
std::unique_ptr<JITSymbolResolver> ExternalSymbolResolver;
std::unique_ptr<ResourceOwner<RuntimeDyld::MemoryManager>> MemMgr;
std::unique_ptr<IndirectStubsMgrT> StubsMgr;
@@ -204,6 +209,11 @@ public:
CreateIndirectStubsManager(std::move(CreateIndirectStubsManager)),
CloneStubsIntoPartitions(CloneStubsIntoPartitions) {}
+ ~CompileOnDemandLayer() {
+ while (!LogicalDylibs.empty())
+ removeModuleSet(LogicalDylibs.begin());
+ }
+
/// @brief Add a module to the compile-on-demand layer.
template <typename ModuleSetT, typename MemoryManagerPtrT,
typename SymbolResolverPtrT>
@@ -239,6 +249,7 @@ public:
/// This will remove all modules in the layers below that were derived from
/// the module represented by H.
void removeModuleSet(ModuleSetHandleT H) {
+ H->removeModulesFromBaseLayer(BaseLayer);
LogicalDylibs.erase(H);
}
@@ -478,6 +489,8 @@ private:
return 0;
}
+ LD.BaseLayerHandles.push_back(PartH);
+
return CalledAddr;
}
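// Illustrative sketch, not part of the patch: the ownership pattern the hunks
// above implement. Each logical dylib records the handles it pushed into the
// base layer, and removal (or destruction of the whole layer) walks them back
// out. The names below are hypothetical stand-ins for the Orc types.

#include <vector>

template <typename BaseLayerT, typename HandleT> struct LogicalDylibSketch {
  std::vector<HandleT> BaseLayerHandles; // filled as partitions are emitted

  void removeModulesFromBaseLayer(BaseLayerT &Base) {
    for (auto &H : BaseLayerHandles)
      Base.removeModuleSet(H); // undo every module set this dylib created
  }
};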
diff --git a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
index 02f59d6a831a..a19c30631c57 100644
--- a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
+++ b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
@@ -144,16 +144,16 @@ public:
void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
size_t Size) override {
- UnfinalizedEHFrames.push_back(
- std::make_pair(LoadAddr, static_cast<uint32_t>(Size)));
+ UnfinalizedEHFrames.push_back({LoadAddr, Size});
}
- void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr,
- size_t Size) override {
- auto Err = Client.deregisterEHFrames(LoadAddr, Size);
- // FIXME: Add error poll.
- assert(!Err && "Failed to register remote EH frames.");
- (void)Err;
+ void deregisterEHFrames() override {
+ for (auto &Frame : RegisteredEHFrames) {
+ auto Err = Client.deregisterEHFrames(Frame.Addr, Frame.Size);
+ // FIXME: Add error poll.
+ assert(!Err && "Failed to deregister remote EH frames.");
+ (void)Err;
+ }
}
void notifyObjectLoaded(RuntimeDyld &Dyld,
@@ -320,7 +320,7 @@ public:
Unfinalized.clear();
for (auto &EHFrame : UnfinalizedEHFrames) {
- if (auto Err = Client.registerEHFrames(EHFrame.first, EHFrame.second)) {
+ if (auto Err = Client.registerEHFrames(EHFrame.Addr, EHFrame.Size)) {
// FIXME: Replace this once finalizeMemory can return an Error.
handleAllErrors(std::move(Err), [&](ErrorInfoBase &EIB) {
if (ErrMsg) {
@@ -331,7 +331,8 @@ public:
return false;
}
}
- UnfinalizedEHFrames.clear();
+ RegisteredEHFrames = std::move(UnfinalizedEHFrames);
+ UnfinalizedEHFrames = {};
return false;
}
@@ -387,7 +388,13 @@ public:
ResourceIdMgr::ResourceId Id;
std::vector<ObjectAllocs> Unmapped;
std::vector<ObjectAllocs> Unfinalized;
- std::vector<std::pair<uint64_t, uint32_t>> UnfinalizedEHFrames;
+
+ struct EHFrame {
+ JITTargetAddress Addr;
+ uint64_t Size;
+ };
+ std::vector<EHFrame> UnfinalizedEHFrames;
+ std::vector<EHFrame> RegisteredEHFrames;
};
/// Remote indirect stubs manager.
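// Illustrative sketch, not part of the patch: the two-phase EH-frame
// bookkeeping the hunks above introduce. Frames accumulate in an unfinalized
// list; finalization registers them and moves them to the registered list,
// which deregistration later drains.

#include <cstdint>
#include <utility>
#include <vector>

struct EHFrame { uint64_t Addr; uint64_t Size; };

struct EHFrameBook {
  std::vector<EHFrame> Unfinalized, Registered;

  void record(uint64_t Addr, uint64_t Size) {
    Unfinalized.push_back({Addr, Size});
  }

  template <typename RegisterFn> void finalize(RegisterFn Register) {
    for (auto &F : Unfinalized)
      Register(F.Addr, F.Size);   // e.g. Client.registerEHFrames(...)
    Registered = std::move(Unfinalized);
    Unfinalized = {};
  }

  template <typename DeregisterFn> void teardown(DeregisterFn Deregister) {
    for (auto &F : Registered)
      Deregister(F.Addr, F.Size); // e.g. Client.deregisterEHFrames(...)
    Registered.clear();
  }
};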
diff --git a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
index babcc7f26aab..5b3426afe584 100644
--- a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
@@ -120,6 +120,10 @@ private:
buildInitialSymbolTable(PFC->Objects);
}
+ ~ConcreteLinkedObjectSet() override {
+ MemMgr->deregisterEHFrames();
+ }
+
void setHandle(ObjSetHandleT H) {
PFC->Handle = H;
}
diff --git a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
index 5638717790bb..74535fe948ff 100644
--- a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
+++ b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
@@ -69,13 +69,8 @@ public:
/// Deregister EH frames in the current process.
static void deregisterEHFramesInProcess(uint8_t *Addr, size_t Size);
- void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override {
- registerEHFramesInProcess(Addr, Size);
- }
-
- void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override {
- deregisterEHFramesInProcess(Addr, Size);
- }
+ void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override;
+ void deregisterEHFrames() override;
/// This method returns the address of the specified function or variable in
/// the current process.
@@ -139,6 +134,13 @@ public:
/// MCJIT or RuntimeDyld. Use getSymbolAddress instead.
virtual void *getPointerToNamedFunction(const std::string &Name,
bool AbortOnFailure = true);
+
+private:
+ struct EHFrame {
+ uint8_t *Addr;
+ size_t Size;
+ };
+ std::vector<EHFrame> EHFrames;
};
// Create wrappers for C Binding types (see CBindingWrapping.h).
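// Plausible shape of the new out-of-line definitions (hedged: the .cpp side
// is not part of this header diff). registerEHFrames() now records each frame
// so the argument-less deregisterEHFrames() can undo all of them at once:
//
//   void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
//                                              uint64_t LoadAddr, size_t Size) {
//     registerEHFramesInProcess(Addr, Size);
//     EHFrames.push_back({Addr, Size});
//   }
//
//   void RTDyldMemoryManager::deregisterEHFrames() {
//     for (auto &Frame : EHFrames)
//       deregisterEHFramesInProcess(Frame.Addr, Frame.Size);
//     EHFrames.clear();
//   }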
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index 13a5f9922c51..9470866dc0d6 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -150,8 +150,7 @@ public:
/// be the case for local execution) these two values will be the same.
virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
size_t Size) = 0;
- virtual void deregisterEHFrames(uint8_t *addr, uint64_t LoadAddr,
- size_t Size) = 0;
+ virtual void deregisterEHFrames() = 0;
/// This method is called when object loading is complete and section page
/// permissions can be applied. It is up to the memory manager implementation
diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h
index cbe681684a5c..d4a896c01867 100644
--- a/include/llvm/IR/Attributes.h
+++ b/include/llvm/IR/Attributes.h
@@ -35,6 +35,7 @@ namespace llvm {
class AttrBuilder;
class AttributeImpl;
class AttributeListImpl;
+class AttributeList;
class AttributeSetNode;
template<typename T> struct DenseMapInfo;
class Function;
@@ -227,14 +228,51 @@ public:
bool operator==(const AttributeSet &O) { return SetNode == O.SetNode; }
bool operator!=(const AttributeSet &O) { return !(*this == O); }
+ /// Add an argument attribute. Because
+ /// attribute sets are immutable, this returns a new set.
+ AttributeSet addAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const;
+
+ /// Add a target-dependent attribute. Because
+ /// attribute sets are immutable, this returns a new set.
+ AttributeSet addAttribute(LLVMContext &C, StringRef Kind,
+ StringRef Value = StringRef()) const;
+
+ /// Add attributes to the attribute set. Because
+ /// attribute sets are immutable, this returns a new set.
+ AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const;
+
+ /// Remove the specified attribute from this set. Because
+ /// attribute sets are immutable, this returns a new set.
+ AttributeSet removeAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const;
+
+ /// Remove the specified attribute from this set. Because
+ /// attribute sets are immutable, this returns a new set.
+ AttributeSet removeAttribute(LLVMContext &C,
+ StringRef Kind) const;
+
+ /// Remove the specified attributes from this set. Because
+ /// attribute sets are immutable, this returns a new set.
+ AttributeSet removeAttributes(LLVMContext &C,
+ const AttrBuilder &AttrsToRemove) const;
+
+ /// Return the number of attributes in this set.
unsigned getNumAttributes() const;
+ /// Return true if attributes exist in this set.
bool hasAttributes() const { return SetNode != nullptr; }
+ /// Return true if the attribute exists in this set.
bool hasAttribute(Attribute::AttrKind Kind) const;
+
+ /// Return true if the attribute exists in this set.
bool hasAttribute(StringRef Kind) const;
+ /// Return the attribute object.
Attribute getAttribute(Attribute::AttrKind Kind) const;
+
+ /// Return the target-dependent attribute object.
Attribute getAttribute(StringRef Kind) const;
unsigned getAlignment() const;
@@ -248,6 +286,9 @@ public:
iterator begin() const;
iterator end() const;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump() const;
+#endif
};
//===----------------------------------------------------------------------===//
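// Illustrative usage, not part of the patch: AttributeSet is immutable, so
// every mutator added above returns a fresh set that must be reassigned.

#include "llvm/IR/Attributes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void demoAttributeSet(LLVMContext &Ctx) {
  AttributeSet AS;
  AS = AS.addAttribute(Ctx, Attribute::NoUnwind);       // enum attribute
  AS = AS.addAttribute(Ctx, "my-string-attr", "value"); // string attribute
  if (AS.hasAttribute("my-string-attr"))
    AS = AS.removeAttribute(Ctx, Attribute::NoUnwind);  // fresh set again
}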
diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h
index 39fb3f1c791b..801e88aba4d1 100644
--- a/include/llvm/IR/CallingConv.h
+++ b/include/llvm/IR/CallingConv.h
@@ -201,6 +201,10 @@ namespace CallingConv {
/// shaders)
AMDGPU_HS = 93,
+ /// Calling convention used for special MSP430 rtlib functions
+ /// which have an "optimized" convention using additional registers.
+ MSP430_BUILTIN = 94,
+
/// The highest possible calling convention ID. Must be some 2^k - 1.
MaxID = 1023
};
diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h
index ad83b21c7bf3..5db9b3bb5048 100644
--- a/include/llvm/IR/Constants.h
+++ b/include/llvm/IR/Constants.h
@@ -26,6 +26,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DerivedTypes.h"
@@ -452,7 +453,14 @@ class ConstantStruct final : public ConstantAggregate {
public:
// ConstantStruct accessors
static Constant *get(StructType *T, ArrayRef<Constant*> V);
- static Constant *get(StructType *T, ...) LLVM_END_WITH_NULL;
+
+ template <typename... Csts>
+ static typename std::enable_if<are_base_of<Constant, Csts...>::value,
+ Constant *>::type
+ get(StructType *T, Csts *... Vs) {
+ SmallVector<Constant *, 8> Values({Vs...});
+ return get(T, Values);
+ }
/// Return an anonymous struct that has the specified elements.
/// If the struct is possibly empty, then you must specify a context.
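// Illustrative usage, not part of the patch: the variadic template replaces
// the old C-style varargs overload, so the trailing nullptr sentinel (and
// LLVM_END_WITH_NULL) disappears and a non-Constant argument now fails to
// compile instead of misbehaving at runtime.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

Constant *makePair(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  StructType *STy = StructType::get(I32, I32);
  Constant *C0 = ConstantInt::get(I32, 1);
  Constant *C1 = ConstantInt::get(I32, 2);
  // Before this patch: ConstantStruct::get(STy, C0, C1, nullptr);
  return ConstantStruct::get(STy, C0, C1);
}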
diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h
index 0331d5229e7f..358106aac43b 100644
--- a/include/llvm/IR/DebugInfoMetadata.h
+++ b/include/llvm/IR/DebugInfoMetadata.h
@@ -16,8 +16,11 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Casting.h"
@@ -56,10 +59,6 @@
namespace llvm {
-class DIBuilder;
-
-template <typename T> class Optional;
-
/// Holds a subclass of DINode.
///
/// FIXME: This class doesn't currently make much sense. Previously it was a
@@ -94,9 +93,9 @@ public:
bool operator!=(const TypedDINodeRef<T> &X) const { return MD != X.MD; }
};
-typedef TypedDINodeRef<DINode> DINodeRef;
-typedef TypedDINodeRef<DIScope> DIScopeRef;
-typedef TypedDINodeRef<DIType> DITypeRef;
+using DINodeRef = TypedDINodeRef<DINode>;
+using DIScopeRef = TypedDINodeRef<DIScope>;
+using DITypeRef = TypedDINodeRef<DIType>;
class DITypeRefArray {
const MDTuple *N = nullptr;
@@ -240,7 +239,8 @@ public:
};
template <class T> struct simplify_type<const TypedDINodeRef<T>> {
- typedef Metadata *SimpleType;
+ using SimpleType = Metadata *;
+
static SimpleType getSimplifiedValue(const TypedDINodeRef<T> &MD) {
return MD;
}
@@ -799,15 +799,18 @@ public:
assert(getTag() == dwarf::DW_TAG_ptr_to_member_type);
return DITypeRef(getExtraData());
}
+
DIObjCProperty *getObjCProperty() const {
return dyn_cast_or_null<DIObjCProperty>(getExtraData());
}
+
Constant *getStorageOffsetInBits() const {
assert(getTag() == dwarf::DW_TAG_member && isBitField());
if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData()))
return C->getValue();
return nullptr;
}
+
Constant *getConstant() const {
assert(getTag() == dwarf::DW_TAG_member && isStaticMember());
if (auto *C = cast_or_null<ConstantAsMetadata>(getExtraData()))
@@ -970,9 +973,11 @@ public:
#endif
replaceOperandWith(4, Elements.get());
}
+
void replaceVTableHolder(DITypeRef VTableHolder) {
replaceOperandWith(5, VTableHolder);
}
+
void replaceTemplateParams(DITemplateParameterArray TemplateParams) {
replaceOperandWith(6, TemplateParams.get());
}
@@ -1031,6 +1036,7 @@ public:
DITypeRefArray getTypeArray() const {
return cast_or_null<MDTuple>(getRawTypeArray());
}
+
Metadata *getRawTypeArray() const { return getOperand(3); }
static bool classof(const Metadata *MD) {
@@ -1319,6 +1325,7 @@ public:
unsigned getLine() const { return SubclassData32; }
unsigned getColumn() const { return SubclassData16; }
DILocalScope *getScope() const { return cast<DILocalScope>(getRawScope()); }
+
DILocation *getInlinedAt() const {
return cast_or_null<DILocation>(getRawInlinedAt());
}
@@ -1452,7 +1459,6 @@ public:
static bool classof(const Metadata *MD) {
return MD->getMetadataID() == DILocationKind;
}
-
};
/// Subprogram description.
@@ -2087,6 +2093,7 @@ public:
return F->getFilename();
return "";
}
+
StringRef getDirectory() const {
if (auto *F = getFile())
return F->getDirectory();
@@ -2143,6 +2150,7 @@ public:
ArrayRef<uint64_t> getElements() const { return Elements; }
unsigned getNumElements() const { return Elements.size(); }
+
uint64_t getElement(unsigned I) const {
assert(I < Elements.size() && "Index out of range");
return Elements[I];
@@ -2151,7 +2159,8 @@ public:
/// Determine whether this represents a standalone constant value.
bool isConstant() const;
- typedef ArrayRef<uint64_t>::iterator element_iterator;
+ using element_iterator = ArrayRef<uint64_t>::iterator;
+
element_iterator elements_begin() const { return getElements().begin(); }
element_iterator elements_end() const { return getElements().end(); }
@@ -2276,6 +2285,10 @@ public:
/// Append \p Ops with operations to apply the \p Offset.
static void appendOffset(SmallVectorImpl<uint64_t> &Ops, int64_t Offset);
+ /// If this is a constant offset, extract it. If there is no expression,
+ /// return true with an offset of zero.
+ bool extractIfOffset(int64_t &Offset) const;
+
/// Constants for DIExpression::prepend.
enum { NoDeref = false, WithDeref = true, WithStackValue = true };
@@ -2509,6 +2522,7 @@ public:
return F->getFilename();
return "";
}
+
StringRef getDirectory() const {
if (auto *F = getFile())
return F->getDirectory();
@@ -2609,10 +2623,13 @@ public:
TempDIGlobalVariableExpression clone() const { return cloneImpl(); }
Metadata *getRawVariable() const { return getOperand(0); }
+
DIGlobalVariable *getVariable() const {
return cast_or_null<DIGlobalVariable>(getRawVariable());
}
+
Metadata *getRawExpression() const { return getOperand(1); }
+
DIExpression *getExpression() const {
return cast_or_null<DIExpression>(getRawExpression());
}
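// Illustrative usage of the extractIfOffset() addition above, not part of the
// patch (assumes a DIExpression *Expr is in scope):
//
//   int64_t Off;
//   if (Expr->extractIfOffset(Off)) {
//     // Expr is either absent/empty (Off == 0) or a pure constant offset.
//   }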
diff --git a/include/llvm/IR/DebugLoc.h b/include/llvm/IR/DebugLoc.h
index 202be3da14da..aa74f361cda2 100644
--- a/include/llvm/IR/DebugLoc.h
+++ b/include/llvm/IR/DebugLoc.h
@@ -80,6 +80,22 @@ namespace llvm {
static DebugLoc get(unsigned Line, unsigned Col, const MDNode *Scope,
const MDNode *InlinedAt = nullptr);
+ enum { ReplaceLastInlinedAt = true };
+ /// Rebuild the entire inlined-at chain for this instruction so that the top
+ /// of the chain is now inlined-at the new call site.
+ /// \param InlinedAt The new outermost inlined-at in the chain.
+ /// \param ReplaceLast Replace the last location in the inlined-at chain.
+ static DebugLoc appendInlinedAt(DebugLoc DL, DILocation *InlinedAt,
+ LLVMContext &Ctx,
+ DenseMap<const MDNode *, MDNode *> &Cache,
+ bool ReplaceLast = false);
+
+ /// Reparent all debug locations referenced by \c I that belong to \c OrigSP
+ /// to become (possibly indirect) children of \c NewSP.
+ static void reparentDebugInfo(Instruction &I, DISubprogram *OrigSP,
+ DISubprogram *NewSP,
+ DenseMap<const MDNode *, MDNode *> &Cache);
+
unsigned getLine() const;
unsigned getCol() const;
MDNode *getScope() const;
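// Illustrative usage, not part of the patch: when inlining, rebuild each
// cloned location's inlined-at chain so it bottoms out at the new call site.
// The DenseMap caches rebuilt nodes across calls; OrigDL and CallSiteLoc are
// hypothetical names.
//
//   DenseMap<const MDNode *, MDNode *> Cache;
//   DebugLoc NewDL =
//       DebugLoc::appendInlinedAt(OrigDL, CallSiteLoc, Ctx, Cache);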
diff --git a/include/llvm/IR/DerivedTypes.h b/include/llvm/IR/DerivedTypes.h
index 05e99157b8dc..a92321a44511 100644
--- a/include/llvm/IR/DerivedTypes.h
+++ b/include/llvm/IR/DerivedTypes.h
@@ -1,4 +1,4 @@
-//===-- llvm/DerivedTypes.h - Classes for handling data types ---*- C++ -*-===//
+//===- llvm/DerivedTypes.h - Classes for handling data types ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -19,6 +19,7 @@
#define LLVM_IR_DERIVEDTYPES_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
@@ -122,7 +123,8 @@ public:
bool isVarArg() const { return getSubclassData()!=0; }
Type *getReturnType() const { return ContainedTys[0]; }
- typedef Type::subtype_iterator param_iterator;
+ using param_iterator = Type::subtype_iterator;
+
param_iterator param_begin() const { return ContainedTys + 1; }
param_iterator param_end() const { return &ContainedTys[NumContainedTys]; }
ArrayRef<Type *> params() const {
@@ -197,8 +199,7 @@ public:
/// generator for a target expects).
///
class StructType : public CompositeType {
- StructType(LLVMContext &C)
- : CompositeType(C, StructTyID), SymbolTableEntry(nullptr) {}
+ StructType(LLVMContext &C) : CompositeType(C, StructTyID) {}
enum {
/// This is the contents of the SubClassData field.
@@ -212,7 +213,7 @@ class StructType : public CompositeType {
/// symbol table entry (maintained by LLVMContext) for the struct.
/// This is null if the type is a literal struct or if it is an identified
/// type that has an empty name.
- void *SymbolTableEntry;
+ void *SymbolTableEntry = nullptr;
public:
StructType(const StructType &) = delete;
@@ -228,7 +229,14 @@ public:
static StructType *create(LLVMContext &Context, ArrayRef<Type *> Elements,
StringRef Name, bool isPacked = false);
static StructType *create(LLVMContext &Context, ArrayRef<Type *> Elements);
- static StructType *create(StringRef Name, Type *elt1, ...) LLVM_END_WITH_NULL;
+ template <class... Tys>
+ static typename std::enable_if<are_base_of<Type, Tys...>::value,
+ StructType *>::type
+ create(StringRef Name, Type *elt1, Tys *... elts) {
+ assert(elt1 && "Cannot create a struct type with no elements with this");
+ SmallVector<llvm::Type *, 8> StructFields({elt1, elts...});
+ return create(StructFields, Name);
+ }
/// This static method is the primary way to create a literal StructType.
static StructType *get(LLVMContext &Context, ArrayRef<Type*> Elements,
@@ -240,7 +248,15 @@ public:
/// This static method is a convenience method for creating structure types by
/// specifying the elements as arguments. Note that this method always returns
/// a non-packed struct, and requires at least one element type.
- static StructType *get(Type *elt1, ...) LLVM_END_WITH_NULL;
+ template <class... Tys>
+ static typename std::enable_if<are_base_of<Type, Tys...>::value,
+ StructType *>::type
+ get(Type *elt1, Tys *... elts) {
+ assert(elt1 && "Cannot create a struct type with no elements with this");
+ LLVMContext &Ctx = elt1->getContext();
+ SmallVector<llvm::Type *, 8> StructFields({elt1, elts...});
+ return llvm::StructType::get(Ctx, StructFields);
+ }
bool isPacked() const { return (getSubclassData() & SCDB_Packed) != 0; }
@@ -269,13 +285,21 @@ public:
/// Specify a body for an opaque identified type.
void setBody(ArrayRef<Type*> Elements, bool isPacked = false);
- void setBody(Type *elt1, ...) LLVM_END_WITH_NULL;
+
+ template <typename... Tys>
+ typename std::enable_if<are_base_of<Type, Tys...>::value, void>::type
+ setBody(Type *elt1, Tys *... elts) {
+ assert(elt1 && "Cannot create a struct type with no elements with this");
+ SmallVector<llvm::Type *, 8> StructFields({elt1, elts...});
+ setBody(StructFields);
+ }
/// Return true if the specified type is valid as an element type.
static bool isValidElementType(Type *ElemTy);
// Iterator access to the elements.
- typedef Type::subtype_iterator element_iterator;
+ using element_iterator = Type::subtype_iterator;
+
element_iterator element_begin() const { return ContainedTys; }
element_iterator element_end() const { return &ContainedTys[NumContainedTys];}
ArrayRef<Type *> const elements() const {
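// Illustrative usage, not part of the patch: the variadic create()/get()/
// setBody() templates above replace the varargs overloads, again dropping
// the trailing nullptr sentinel.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

StructType *makeNode(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  StructType *Node = StructType::create(Ctx, "my.node"); // opaque for now
  Node->setBody(I32, PointerType::getUnqual(Node));      // variadic setBody
  return Node;
}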
diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h
index 458c3cf29b0d..5497652135bd 100644
--- a/include/llvm/IR/DiagnosticInfo.h
+++ b/include/llvm/IR/DiagnosticInfo.h
@@ -15,7 +15,6 @@
#ifndef LLVM_IR_DIAGNOSTICINFO_H
#define LLVM_IR_DIAGNOSTICINFO_H
-#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -120,18 +119,18 @@ public:
virtual void print(DiagnosticPrinter &DP) const = 0;
};
-typedef std::function<void(const DiagnosticInfo &)> DiagnosticHandlerFunction;
+using DiagnosticHandlerFunction = std::function<void(const DiagnosticInfo &)>;
/// Diagnostic information for inline asm reporting.
/// This is basically a message and an optional location.
class DiagnosticInfoInlineAsm : public DiagnosticInfo {
private:
/// Optional line information. 0 if not set.
- unsigned LocCookie;
+ unsigned LocCookie = 0;
/// Message to be reported.
const Twine &MsgStr;
/// Optional origin of the problem.
- const Instruction *Instr;
+ const Instruction *Instr = nullptr;
public:
/// \p MsgStr is the message to be reported to the frontend.
@@ -139,8 +138,7 @@ public:
/// for the whole life time of the Diagnostic.
DiagnosticInfoInlineAsm(const Twine &MsgStr,
DiagnosticSeverity Severity = DS_Error)
- : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(0), MsgStr(MsgStr),
- Instr(nullptr) {}
+ : DiagnosticInfo(DK_InlineAsm, Severity), MsgStr(MsgStr) {}
/// \p LocCookie if non-zero gives the line number for this report.
/// \p MsgStr gives the message.
@@ -149,7 +147,7 @@ public:
DiagnosticInfoInlineAsm(unsigned LocCookie, const Twine &MsgStr,
DiagnosticSeverity Severity = DS_Error)
: DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(LocCookie),
- MsgStr(MsgStr), Instr(nullptr) {}
+ MsgStr(MsgStr) {}
/// \p Instr gives the original instruction that triggered the diagnostic.
/// \p MsgStr gives the message.
@@ -294,10 +292,10 @@ public:
DiagnosticInfoSampleProfile(StringRef FileName, const Twine &Msg,
DiagnosticSeverity Severity = DS_Error)
: DiagnosticInfo(DK_SampleProfile, Severity), FileName(FileName),
- LineNum(0), Msg(Msg) {}
+ Msg(Msg) {}
DiagnosticInfoSampleProfile(const Twine &Msg,
DiagnosticSeverity Severity = DS_Error)
- : DiagnosticInfo(DK_SampleProfile, Severity), LineNum(0), Msg(Msg) {}
+ : DiagnosticInfo(DK_SampleProfile, Severity), Msg(Msg) {}
/// \see DiagnosticInfo::print.
void print(DiagnosticPrinter &DP) const override;
@@ -316,7 +314,7 @@ private:
/// Line number where the diagnostic occurred. If 0, no line number will
/// be emitted in the message.
- unsigned LineNum;
+ unsigned LineNum = 0;
/// Message to report.
const Twine &Msg;
@@ -351,8 +349,9 @@ class DiagnosticLocation {
StringRef Filename;
unsigned Line = 0;
unsigned Column = 0;
+
public:
- DiagnosticLocation() {}
+ DiagnosticLocation() = default;
DiagnosticLocation(const DebugLoc &DL);
DiagnosticLocation(const DISubprogram *SP);
@@ -796,6 +795,7 @@ private:
const Twine &Msg)
: OptimizationRemarkAnalysis(DK_OptimizationRemarkAnalysisFPCommute,
PassName, Fn, Loc, Msg) {}
+
friend void emitOptimizationRemarkAnalysisFPCommute(
LLVMContext &Ctx, const char *PassName, const Function &Fn,
const DiagnosticLocation &Loc, const Twine &Msg);
@@ -1012,6 +1012,7 @@ public:
void print(DiagnosticPrinter &DP) const override;
};
+
} // end namespace llvm
#endif // LLVM_IR_DIAGNOSTICINFO_H
diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h
index c12a125b6352..8a2a6ed87eb2 100644
--- a/include/llvm/IR/Function.h
+++ b/include/llvm/IR/Function.h
@@ -1,4 +1,4 @@
-//===-- llvm/Function.h - Class to represent a single function --*- C++ -*-===//
+//===- llvm/Function.h - Class to represent a single function ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -22,15 +22,19 @@
#include "llvm/ADT/ilist_node.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/SymbolTableListTraits.h"
#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include <cassert>
#include <cstddef>
@@ -40,27 +44,31 @@
namespace llvm {
-template <typename T> class Optional;
class AssemblyAnnotationWriter;
-class FunctionType;
-class LLVMContext;
+class Constant;
class DISubprogram;
+class LLVMContext;
+class Module;
+template <typename T> class Optional;
+class raw_ostream;
+class Type;
+class User;
class Function : public GlobalObject, public ilist_node<Function> {
public:
- typedef SymbolTableList<BasicBlock> BasicBlockListType;
+ using BasicBlockListType = SymbolTableList<BasicBlock>;
// BasicBlock iterators...
- typedef BasicBlockListType::iterator iterator;
- typedef BasicBlockListType::const_iterator const_iterator;
+ using iterator = BasicBlockListType::iterator;
+ using const_iterator = BasicBlockListType::const_iterator;
- typedef Argument *arg_iterator;
- typedef const Argument *const_arg_iterator;
+ using arg_iterator = Argument *;
+ using const_arg_iterator = const Argument *;
private:
// Important things that make up a function!
- BasicBlockListType BasicBlocks; ///< The basic blocks
- mutable Argument *Arguments; ///< The formal arguments
+ BasicBlockListType BasicBlocks; ///< The basic blocks
+ mutable Argument *Arguments = nullptr; ///< The formal arguments
size_t NumArgs;
std::unique_ptr<ValueSymbolTable>
SymTab; ///< Symbol table of args/instructions
@@ -124,10 +132,12 @@ public:
// Provide fast operand accessors.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
/// Returns the FunctionType for me.
FunctionType *getFunctionType() const {
return cast<FunctionType>(getValueType());
}
+
/// Returns the type of the ret val.
Type *getReturnType() const { return getFunctionType()->getReturnType(); }
@@ -484,7 +494,7 @@ public:
/// copyAttributesFrom - copy all additional attributes (those not needed to
/// create a Function) from the Function Src to this one.
- void copyAttributesFrom(const GlobalValue *Src) override;
+ void copyAttributesFrom(const Function *Src);
/// deleteBody - This method deletes the body of the function, and converts
/// the linkage to external.
@@ -497,12 +507,12 @@ public:
/// removeFromParent - This method unlinks 'this' from the containing module,
/// but does not delete it.
///
- void removeFromParent() override;
+ void removeFromParent();
/// eraseFromParent - This method unlinks 'this' from the containing module
/// and deletes it.
///
- void eraseFromParent() override;
+ void eraseFromParent();
/// Steal arguments from another function.
///
diff --git a/include/llvm/IR/GetElementPtrTypeIterator.h b/include/llvm/IR/GetElementPtrTypeIterator.h
index 490bff29cf38..f017a449d33f 100644
--- a/include/llvm/IR/GetElementPtrTypeIterator.h
+++ b/include/llvm/IR/GetElementPtrTypeIterator.h
@@ -21,7 +21,9 @@
#include "llvm/IR/Operator.h"
#include "llvm/IR/User.h"
#include "llvm/Support/Casting.h"
+#include <cassert>
#include <cstddef>
+#include <cstdint>
#include <iterator>
namespace llvm {
@@ -29,13 +31,13 @@ namespace llvm {
template<typename ItTy = User::const_op_iterator>
class generic_gep_type_iterator
: public std::iterator<std::forward_iterator_tag, Type *, ptrdiff_t> {
- typedef std::iterator<std::forward_iterator_tag,
- Type *, ptrdiff_t> super;
+ using super = std::iterator<std::forward_iterator_tag, Type *, ptrdiff_t>;
ItTy OpIt;
PointerUnion<StructType *, Type *> CurTy;
enum : uint64_t { Unbounded = -1ull };
uint64_t NumElements = Unbounded;
+
generic_gep_type_iterator() = default;
public:
@@ -121,7 +123,7 @@ namespace llvm {
}
};
- typedef generic_gep_type_iterator<> gep_type_iterator;
+ using gep_type_iterator = generic_gep_type_iterator<>;
inline gep_type_iterator gep_type_begin(const User *GEP) {
auto *GEPOp = cast<GEPOperator>(GEP);
diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h
index 37a291dfeb7a..d4bf0d7e1ed4 100644
--- a/include/llvm/IR/GlobalAlias.h
+++ b/include/llvm/IR/GlobalAlias.h
@@ -59,15 +59,19 @@ public:
// Linkage, Type, Parent and AddressSpace taken from the Aliasee.
static GlobalAlias *create(const Twine &Name, GlobalValue *Aliasee);
+ void copyAttributesFrom(const GlobalValue *Src) {
+ GlobalValue::copyAttributesFrom(Src);
+ }
+
/// removeFromParent - This method unlinks 'this' from the containing module,
/// but does not delete it.
///
- void removeFromParent() override;
+ void removeFromParent();
/// eraseFromParent - This method unlinks 'this' from the containing module
/// and deletes it.
///
- void eraseFromParent() override;
+ void eraseFromParent();
/// These methods retrieve and set alias target.
void setAliasee(Constant *Aliasee);
diff --git a/include/llvm/IR/GlobalIFunc.h b/include/llvm/IR/GlobalIFunc.h
index bfaa9960cb13..d90c7c78ed26 100644
--- a/include/llvm/IR/GlobalIFunc.h
+++ b/include/llvm/IR/GlobalIFunc.h
@@ -47,12 +47,16 @@ public:
LinkageTypes Linkage, const Twine &Name,
Constant *Resolver, Module *Parent);
+ void copyAttributesFrom(const GlobalIFunc *Src) {
+ GlobalValue::copyAttributesFrom(Src);
+ }
+
/// This method unlinks 'this' from the containing module, but does not
/// delete it.
- void removeFromParent() final;
+ void removeFromParent();
/// This method unlinks 'this' from the containing module and deletes it.
- void eraseFromParent() final;
+ void eraseFromParent();
/// These methods retrieve and set ifunc resolver function.
void setResolver(Constant *Resolver) {
diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h
index f3789bafefe3..fc38f698027b 100644
--- a/include/llvm/IR/GlobalObject.h
+++ b/include/llvm/IR/GlobalObject.h
@@ -150,8 +150,10 @@ public:
void addTypeMetadata(unsigned Offset, Metadata *TypeID);
- void copyAttributesFrom(const GlobalValue *Src) override;
+protected:
+ void copyAttributesFrom(const GlobalObject *Src);
+public:
// Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Value *V) {
return V->getValueID() == Value::FunctionVal ||
diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h
index bb30fa8be867..0793a1c0ee2e 100644
--- a/include/llvm/IR/GlobalValue.h
+++ b/include/llvm/IR/GlobalValue.h
@@ -435,14 +435,20 @@ public:
bool isWeakForLinker() const { return isWeakForLinker(getLinkage()); }
+protected:
/// Copy all additional attributes (those not needed to create a GlobalValue)
/// from the GlobalValue Src to this one.
- virtual void copyAttributesFrom(const GlobalValue *Src);
+ void copyAttributesFrom(const GlobalValue *Src);
- /// If special LLVM prefix that is used to inform the asm printer to not emit
- /// usual symbol prefix before the symbol name is used then return linkage
- /// name after skipping this special LLVM prefix.
- static StringRef getRealLinkageName(StringRef Name) {
+public:
+ /// If the given string begins with the GlobalValue name mangling escape
+ /// character '\1', drop it.
+ ///
+ /// This function applies a specific mangling that is used in PGO profiles,
+ /// among other things. If you're trying to get a symbol name for an
+ /// arbitrary GlobalValue, this is not the function you're looking for; see
+ /// Mangler.h.
+ static StringRef dropLLVMManglingEscape(StringRef Name) {
if (!Name.empty() && Name[0] == '\1')
return Name.substr(1);
return Name;
@@ -530,10 +536,10 @@ public:
/// This method unlinks 'this' from the containing module, but does not delete
/// it.
- virtual void removeFromParent() = 0;
+ void removeFromParent();
/// This method unlinks 'this' from the containing module and deletes it.
- virtual void eraseFromParent() = 0;
+ void eraseFromParent();
/// Get the module that this global value is contained inside of...
Module *getParent() { return Parent; }
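// Illustrative sketch, not part of the patch: the devirtualization pattern
// these GlobalValue-family hunks apply. Virtual functions become non-virtual
// members redeclared per subclass, so each call site resolves statically.
// The names are hypothetical.

struct BaseSketch {
  int Vis = 0;
  void copyAttributesFrom(const BaseSketch *Src) { Vis = Src->Vis; }
};

struct DerivedSketch : BaseSketch {
  int Extra = 0;
  // Shadows, rather than overrides, the base version; callers holding a
  // DerivedSketch* statically pick this overload.
  void copyAttributesFrom(const DerivedSketch *Src) {
    BaseSketch::copyAttributesFrom(Src); // chain to the base, as the hunks do
    Extra = Src->Extra;
  }
};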
diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h
index 3b545d811d44..21d334c8f01d 100644
--- a/include/llvm/IR/GlobalVariable.h
+++ b/include/llvm/IR/GlobalVariable.h
@@ -24,6 +24,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist_node.h"
#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Value.h"
#include <cassert>
@@ -41,6 +42,7 @@ class DIGlobalVariableExpression;
class GlobalVariable : public GlobalObject, public ilist_node<GlobalVariable> {
friend class SymbolTableListTraits<GlobalVariable>;
+ AttributeSet Attrs;
bool isConstantGlobal : 1; // Is this a global constant?
bool isExternallyInitializedConstant : 1; // Is this a global whose value
// can change from its initial
@@ -156,17 +158,17 @@ public:
/// copyAttributesFrom - copy all additional attributes (those not needed to
/// create a GlobalVariable) from the GlobalVariable Src to this one.
- void copyAttributesFrom(const GlobalValue *Src) override;
+ void copyAttributesFrom(const GlobalVariable *Src);
/// removeFromParent - This method unlinks 'this' from the containing module,
/// but does not delete it.
///
- void removeFromParent() override;
+ void removeFromParent();
/// eraseFromParent - This method unlinks 'this' from the containing module
/// and deletes it.
///
- void eraseFromParent() override;
+ void eraseFromParent();
/// Drop all references in preparation to destroy the GlobalVariable. This
/// drops not only the reference to the initializer but also to any metadata.
@@ -178,6 +180,61 @@ public:
/// Fill the vector with all debug info attachments.
void getDebugInfo(SmallVectorImpl<DIGlobalVariableExpression *> &GVs) const;
+ /// Add attribute to this global.
+ void addAttribute(Attribute::AttrKind Kind) {
+ Attrs = Attrs.addAttribute(getContext(), Kind);
+ }
+
+ /// Add attribute to this global.
+ void addAttribute(StringRef Kind, StringRef Val = StringRef()) {
+ Attrs = Attrs.addAttribute(getContext(), Kind, Val);
+ }
+
+ /// Return true if the attribute exists.
+ bool hasAttribute(Attribute::AttrKind Kind) const {
+ return Attrs.hasAttribute(Kind);
+ }
+
+ /// Return true if the attribute exists.
+ bool hasAttribute(StringRef Kind) const {
+ return Attrs.hasAttribute(Kind);
+ }
+
+ /// Return true if any attributes exist.
+ bool hasAttributes() const {
+ return Attrs.hasAttributes();
+ }
+
+ /// Return the attribute object.
+ Attribute getAttribute(Attribute::AttrKind Kind) const {
+ return Attrs.getAttribute(Kind);
+ }
+
+ /// Return the attribute object.
+ Attribute getAttribute(StringRef Kind) const {
+ return Attrs.getAttribute(Kind);
+ }
+
+ /// Return the attribute set for this global.
+ AttributeSet getAttributes() const {
+ return Attrs;
+ }
+
+ /// Return attribute set as list with index.
+ /// FIXME: This may not be required once ValueEnumerators
+ /// in bitcode-writer can enumerate attribute-set.
+ AttributeList getAttributesAsList(unsigned index) const {
+ if (!hasAttributes())
+ return AttributeList();
+ std::pair<unsigned, AttributeSet> AS[1] = {{index, Attrs}};
+ return AttributeList::get(getContext(), AS);
+ }
+
+ /// Set the attribute set for this global.
+ void setAttributes(AttributeSet A) {
+ Attrs = A;
+ }
+
// Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Value *V) {
return V->getValueID() == Value::GlobalVariableVal;
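// Illustrative usage, not part of the patch: globals now carry an immutable
// AttributeSet, manipulated through the wrappers added above. The attribute
// name is hypothetical.

#include "llvm/IR/GlobalVariable.h"
using namespace llvm;

void tagGlobal(GlobalVariable *GV) {
  GV->addAttribute("my-string-attr", "value");
  if (GV->hasAttributes() && GV->hasAttribute("my-string-attr"))
    GV->setAttributes(AttributeSet()); // drop everything again
}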
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index bc689f3b01d7..9d4c13c29f68 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -454,6 +454,45 @@ public:
MDNode *ScopeTag = nullptr,
MDNode *NoAliasTag = nullptr);
+ /// \brief Create a vector fadd reduction intrinsic of the source vector.
+ /// The first parameter is a scalar accumulator value for ordered reductions.
+ CallInst *CreateFAddReduce(Value *Acc, Value *Src);
+
+ /// \brief Create a vector fmul reduction intrinsic of the source vector.
+ /// The first parameter is a scalar accumulator value for ordered reductions.
+ CallInst *CreateFMulReduce(Value *Acc, Value *Src);
+
+ /// \brief Create a vector int add reduction intrinsic of the source vector.
+ CallInst *CreateAddReduce(Value *Src);
+
+ /// \brief Create a vector int mul reduction intrinsic of the source vector.
+ CallInst *CreateMulReduce(Value *Src);
+
+ /// \brief Create a vector int AND reduction intrinsic of the source vector.
+ CallInst *CreateAndReduce(Value *Src);
+
+ /// \brief Create a vector int OR reduction intrinsic of the source vector.
+ CallInst *CreateOrReduce(Value *Src);
+
+ /// \brief Create a vector int XOR reduction intrinsic of the source vector.
+ CallInst *CreateXorReduce(Value *Src);
+
+ /// \brief Create a vector integer max reduction intrinsic of the source
+ /// vector.
+ CallInst *CreateIntMaxReduce(Value *Src, bool IsSigned = false);
+
+ /// \brief Create a vector integer min reduction intrinsic of the source
+ /// vector.
+ CallInst *CreateIntMinReduce(Value *Src, bool IsSigned = false);
+
+ /// \brief Create a vector float max reduction intrinsic of the source
+ /// vector.
+ CallInst *CreateFPMaxReduce(Value *Src, bool NoNaN = false);
+
+ /// \brief Create a vector float min reduction intrinsic of the source
+ /// vector.
+ CallInst *CreateFPMinReduce(Value *Src, bool NoNaN = false);
+
/// \brief Create a lifetime.start intrinsic.
///
/// If the pointer isn't i8* it will be converted.
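// Illustrative usage, not part of the patch: collapse a vector into a scalar
// with one of the new reduction builders. Assumes Builder is positioned
// inside a function and Vec has an integer vector type.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *sumLanes(IRBuilder<> &Builder, Value *Vec) {
  // Emits a call to llvm.experimental.vector.reduce.add on Vec.
  return Builder.CreateAddReduce(Vec);
}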
diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index d16a5d318d78..61ca90de7393 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h
@@ -65,27 +65,15 @@ protected:
// Out of line virtual method, so the vtable, etc has a home.
~TerminatorInst() override;
- /// Virtual methods - Terminators should overload these and provide inline
- /// overrides of non-V methods.
- virtual BasicBlock *getSuccessorV(unsigned idx) const = 0;
- virtual unsigned getNumSuccessorsV() const = 0;
- virtual void setSuccessorV(unsigned idx, BasicBlock *B) = 0;
-
public:
/// Return the number of successors that this terminator has.
- unsigned getNumSuccessors() const {
- return getNumSuccessorsV();
- }
+ unsigned getNumSuccessors() const;
/// Return the specified successor.
- BasicBlock *getSuccessor(unsigned idx) const {
- return getSuccessorV(idx);
- }
+ BasicBlock *getSuccessor(unsigned idx) const;
/// Update the specified successor to point at the provided block.
- void setSuccessor(unsigned idx, BasicBlock *B) {
- setSuccessorV(idx, B);
- }
+ void setSuccessor(unsigned idx, BasicBlock *B);
// Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index 90c3175122fd..fca29900f4c2 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h
@@ -456,6 +456,12 @@ public:
/// higher.
bool isAtomic() const;
+ /// Return true if this atomic instruction loads from memory.
+ bool hasAtomicLoad() const;
+
+ /// Return true if this atomic instruction stores to memory.
+ bool hasAtomicStore() const;
+
/// Return true if this instruction may throw an exception.
bool mayThrow() const;
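// Illustrative usage of the two new predicates above, not part of the patch
// (assumes an Instruction *I):
//
//   if (I->isAtomic() && I->hasAtomicStore()) {
//     // I is an atomic instruction that writes memory.
//   }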
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index 844a7273eca9..c26701af27ce 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -1,4 +1,4 @@
-//===-- llvm/Instructions.h - Instruction subclass definitions --*- C++ -*-===//
+//===- llvm/Instructions.h - Instruction subclass definitions ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,6 +17,7 @@
#define LLVM_IR_INSTRUCTIONS_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallVector.h"
@@ -24,21 +25,25 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
+#include <iterator>
namespace llvm {
@@ -264,6 +269,7 @@ public:
}
bool isSimple() const { return !isAtomic() && !isVolatile(); }
+
bool isUnordered() const {
return (getOrdering() == AtomicOrdering::NotAtomic ||
getOrdering() == AtomicOrdering::Unordered) &&
@@ -386,6 +392,7 @@ public:
}
bool isSimple() const { return !isAtomic() && !isVolatile(); }
+
bool isUnordered() const {
return (getOrdering() == AtomicOrdering::NotAtomic ||
getOrdering() == AtomicOrdering::Unordered) &&
@@ -836,10 +843,7 @@ class GetElementPtrInst : public Instruction {
Type *SourceElementType;
Type *ResultElementType;
- void anchor() override;
-
GetElementPtrInst(const GetElementPtrInst &GEPI);
- void init(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &NameStr);
/// Constructors - Create a getelementptr instruction with a base pointer and a
/// list of indices. The first ctor can optionally insert before an existing
@@ -852,6 +856,9 @@ class GetElementPtrInst : public Instruction {
ArrayRef<Value *> IdxList, unsigned Values,
const Twine &NameStr, BasicBlock *InsertAtEnd);
+ void anchor() override;
+ void init(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &NameStr);
+
protected:
// Note: Instruction needs to be a friend here to call cloneImpl.
friend class Instruction;
@@ -2261,6 +2268,19 @@ public:
return Mask;
}
+ /// Change values in a shuffle permute mask assuming the two vector operands
+ /// of length InVecNumElts have swapped position.
+ static void commuteShuffleMask(MutableArrayRef<int> Mask,
+ unsigned InVecNumElts) {
+ for (int &Idx : Mask) {
+ if (Idx == -1)
+ continue;
+ Idx = Idx < (int)InVecNumElts ? Idx + InVecNumElts : Idx - InVecNumElts;
+ assert(Idx >= 0 && Idx < (int)InVecNumElts * 2 &&
+ "shufflevector mask index out of range");
+ }
+ }
+
// Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::ShuffleVector;
@@ -2288,6 +2308,7 @@ class ExtractValueInst : public UnaryInstruction {
SmallVector<unsigned, 4> Indices;
ExtractValueInst(const ExtractValueInst &EVI);
+
/// Constructors - Create an extractvalue instruction with a base aggregate
/// value and a list of indices. The first ctor can optionally insert before
/// an existing instruction, the second appends the new instruction to the
@@ -2333,7 +2354,8 @@ public:
/// Null is returned if the indices are invalid for the specified type.
static Type *getIndexedType(Type *Agg, ArrayRef<unsigned> Idxs);
- typedef const unsigned* idx_iterator;
+ using idx_iterator = const unsigned*;
+
inline idx_iterator idx_begin() const { return Indices.begin(); }
inline idx_iterator idx_end() const { return Indices.end(); }
inline iterator_range<idx_iterator> indices() const {
@@ -2455,7 +2477,8 @@ public:
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
- typedef const unsigned* idx_iterator;
+ using idx_iterator = const unsigned*;
+
inline idx_iterator idx_begin() const { return Indices.begin(); }
inline idx_iterator idx_end() const { return Indices.end(); }
inline iterator_range<idx_iterator> indices() const {
@@ -2606,8 +2629,8 @@ public:
// Block iterator interface. This provides access to the list of incoming
// basic blocks, which parallels the list of incoming values.
- typedef BasicBlock **block_iterator;
- typedef BasicBlock * const *const_block_iterator;
+ using block_iterator = BasicBlock **;
+ using const_block_iterator = BasicBlock * const *;
block_iterator block_begin() {
Use::UserRef *ref =
@@ -2656,9 +2679,11 @@ public:
"All operands to PHI node must be the same type as the PHI node!");
setOperand(i, V);
}
+
static unsigned getOperandNumForIncomingValue(unsigned i) {
return i;
}
+
static unsigned getIncomingValueNumForOperand(unsigned i) {
return i;
}
@@ -2937,9 +2962,11 @@ public:
}
private:
- BasicBlock *getSuccessorV(unsigned idx) const override;
- unsigned getNumSuccessorsV() const override;
- void setSuccessorV(unsigned idx, BasicBlock *B) override;
+ friend TerminatorInst;
+
+ BasicBlock *getSuccessorV(unsigned idx) const;
+ unsigned getNumSuccessorsV() const;
+ void setSuccessorV(unsigned idx, BasicBlock *B);
};
template <>
@@ -3047,9 +3074,11 @@ public:
}
private:
- BasicBlock *getSuccessorV(unsigned idx) const override;
- unsigned getNumSuccessorsV() const override;
- void setSuccessorV(unsigned idx, BasicBlock *B) override;
+ friend TerminatorInst;
+
+ BasicBlock *getSuccessorV(unsigned idx) const;
+ unsigned getNumSuccessorsV() const;
+ void setSuccessorV(unsigned idx, BasicBlock *B);
};
template <>
@@ -3123,7 +3152,7 @@ public:
protected:
// Expose the switch type we're parameterized with to the iterator.
- typedef SwitchInstT SwitchInstType;
+ using SwitchInstType = SwitchInstT;
SwitchInstT *SI;
ptrdiff_t Index;
@@ -3164,8 +3193,8 @@ public:
}
};
- typedef CaseHandleImpl<const SwitchInst, const ConstantInt, const BasicBlock>
- ConstCaseHandle;
+ using ConstCaseHandle =
+ CaseHandleImpl<const SwitchInst, const ConstantInt, const BasicBlock>;
class CaseHandle
: public CaseHandleImpl<SwitchInst, ConstantInt, BasicBlock> {
@@ -3192,7 +3221,7 @@ public:
: public iterator_facade_base<CaseIteratorImpl<CaseHandleT>,
std::random_access_iterator_tag,
CaseHandleT> {
- typedef typename CaseHandleT::SwitchInstType SwitchInstT;
+ using SwitchInstT = typename CaseHandleT::SwitchInstType;
CaseHandleT Case;
@@ -3254,8 +3283,8 @@ public:
const CaseHandleT &operator*() const { return Case; }
};
- typedef CaseIteratorImpl<CaseHandle> CaseIt;
- typedef CaseIteratorImpl<ConstCaseHandle> ConstCaseIt;
+ using CaseIt = CaseIteratorImpl<CaseHandle>;
+ using ConstCaseIt = CaseIteratorImpl<ConstCaseHandle>;
static SwitchInst *Create(Value *Value, BasicBlock *Default,
unsigned NumCases,
@@ -3411,9 +3440,11 @@ public:
}
private:
- BasicBlock *getSuccessorV(unsigned idx) const override;
- unsigned getNumSuccessorsV() const override;
- void setSuccessorV(unsigned idx, BasicBlock *B) override;
+ friend TerminatorInst;
+
+ BasicBlock *getSuccessorV(unsigned idx) const;
+ unsigned getNumSuccessorsV() const;
+ void setSuccessorV(unsigned idx, BasicBlock *B);
};
template <>
@@ -3516,9 +3547,11 @@ public:
}
private:
- BasicBlock *getSuccessorV(unsigned idx) const override;
- unsigned getNumSuccessorsV() const override;
- void setSuccessorV(unsigned idx, BasicBlock *B) override;
+ friend TerminatorInst;
+
+ BasicBlock *getSuccessorV(unsigned idx) const;
+ unsigned getNumSuccessorsV() const;
+ void setSuccessorV(unsigned idx, BasicBlock *B);
};
template <>
@@ -3639,6 +3672,7 @@ public:
return new (Values) InvokeInst(Func, IfNormal, IfException, Args, None,
Values, NameStr, InsertAtEnd);
}
+
static InvokeInst *Create(Value *Func, BasicBlock *IfNormal,
BasicBlock *IfException, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles,
@@ -3996,9 +4030,11 @@ public:
}
private:
- BasicBlock *getSuccessorV(unsigned idx) const override;
- unsigned getNumSuccessorsV() const override;
- void setSuccessorV(unsigned idx, BasicBlock *B) override;
+ friend TerminatorInst;
+
+ BasicBlock *getSuccessorV(unsigned idx) const;
+ unsigned getNumSuccessorsV() const;
+ void setSuccessorV(unsigned idx, BasicBlock *B);
template <typename AttrKind> bool hasFnAttrImpl(AttrKind Kind) const {
if (Attrs.hasAttribute(AttributeList::FunctionIndex, Kind))
@@ -4095,9 +4131,11 @@ public:
}
private:
- BasicBlock *getSuccessorV(unsigned idx) const override;
- unsigned getNumSuccessorsV() const override;
- void setSuccessorV(unsigned idx, BasicBlock *B) override;
+ friend TerminatorInst;
+
+ BasicBlock *getSuccessorV(unsigned idx) const;
+ unsigned getNumSuccessorsV() const;
+ void setSuccessorV(unsigned idx, BasicBlock *B);
};
template <>
@@ -4202,13 +4240,14 @@ private:
}
public:
- typedef std::pointer_to_unary_function<Value *, BasicBlock *> DerefFnTy;
- typedef mapped_iterator<op_iterator, DerefFnTy> handler_iterator;
- typedef iterator_range<handler_iterator> handler_range;
- typedef std::pointer_to_unary_function<const Value *, const BasicBlock *>
- ConstDerefFnTy;
- typedef mapped_iterator<const_op_iterator, ConstDerefFnTy> const_handler_iterator;
- typedef iterator_range<const_handler_iterator> const_handler_range;
+ using DerefFnTy = std::pointer_to_unary_function<Value *, BasicBlock *>;
+ using handler_iterator = mapped_iterator<op_iterator, DerefFnTy>;
+ using handler_range = iterator_range<handler_iterator>;
+ using ConstDerefFnTy =
+ std::pointer_to_unary_function<const Value *, const BasicBlock *>;
+ using const_handler_iterator =
+ mapped_iterator<const_op_iterator, ConstDerefFnTy>;
+ using const_handler_range = iterator_range<const_handler_iterator>;
/// Returns an iterator that points to the first handler in CatchSwitchInst.
handler_iterator handler_begin() {
@@ -4278,9 +4317,11 @@ public:
}
private:
- BasicBlock *getSuccessorV(unsigned Idx) const override;
- unsigned getNumSuccessorsV() const override;
- void setSuccessorV(unsigned Idx, BasicBlock *B) override;
+ friend TerminatorInst;
+
+ BasicBlock *getSuccessorV(unsigned Idx) const;
+ unsigned getNumSuccessorsV() const;
+ void setSuccessorV(unsigned Idx, BasicBlock *B);
};
template <>
@@ -4443,9 +4484,11 @@ public:
}
private:
- BasicBlock *getSuccessorV(unsigned Idx) const override;
- unsigned getNumSuccessorsV() const override;
- void setSuccessorV(unsigned Idx, BasicBlock *B) override;
+ friend TerminatorInst;
+
+ BasicBlock *getSuccessorV(unsigned Idx) const;
+ unsigned getNumSuccessorsV() const;
+ void setSuccessorV(unsigned Idx, BasicBlock *B);
};
template <>
@@ -4531,9 +4574,11 @@ public:
}
private:
- BasicBlock *getSuccessorV(unsigned Idx) const override;
- unsigned getNumSuccessorsV() const override;
- void setSuccessorV(unsigned Idx, BasicBlock *B) override;
+ friend TerminatorInst;
+
+ BasicBlock *getSuccessorV(unsigned Idx) const;
+ unsigned getNumSuccessorsV() const;
+ void setSuccessorV(unsigned Idx, BasicBlock *B);
// Shadow Instruction::setInstructionSubclassData with a private forwarding
// method so that subclasses cannot accidentally use it.
@@ -4586,9 +4631,11 @@ public:
}
private:
- BasicBlock *getSuccessorV(unsigned idx) const override;
- unsigned getNumSuccessorsV() const override;
- void setSuccessorV(unsigned idx, BasicBlock *B) override;
+ friend TerminatorInst;
+
+ BasicBlock *getSuccessorV(unsigned idx) const;
+ unsigned getNumSuccessorsV() const;
+ void setSuccessorV(unsigned idx, BasicBlock *B);
};
//===----------------------------------------------------------------------===//
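// Worked example for commuteShuffleMask() above (illustrative, not part of
// the patch): with two 4-lane operands, indices 0-3 select from the first
// vector and 4-7 from the second, so swapping the operands moves each index
// across that boundary while -1 (undef) lanes are left alone.
//
//   int Mask[] = {0, 5, -1, 3};
//   // commuteShuffleMask(Mask, 4) yields {4, 1, -1, 7}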
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 7b78d4d3d34a..19f6045568f4 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -812,6 +812,50 @@ def int_memcpy_element_atomic : Intrinsic<[],
[IntrArgMemOnly, NoCapture<0>, NoCapture<1>,
WriteOnly<0>, ReadOnly<1>]>;
+//===------------------------ Reduction Intrinsics ------------------------===//
+//
+def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty],
+ [llvm_anyfloat_ty,
+ llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty],
+ [llvm_anyfloat_ty,
+ llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_mul : Intrinsic<[llvm_anyint_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_and : Intrinsic<[llvm_anyint_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_or : Intrinsic<[llvm_anyint_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_xor : Intrinsic<[llvm_anyint_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_smax : Intrinsic<[llvm_anyint_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_smin : Intrinsic<[llvm_anyint_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_umax : Intrinsic<[llvm_anyint_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_umin : Intrinsic<[llvm_anyint_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_fmax : Intrinsic<[llvm_anyfloat_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_experimental_vector_reduce_fmin : Intrinsic<[llvm_anyfloat_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
//===----- Intrinsics that are used to provide predicate information -----===//
def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h
index d13d5ddaeb3c..ad011fb72e6a 100644
--- a/include/llvm/IR/LLVMContext.h
+++ b/include/llvm/IR/LLVMContext.h
@@ -1,4 +1,4 @@
-//===-- llvm/LLVMContext.h - Class for managing "global" state --*- C++ -*-===//
+//===- llvm/LLVMContext.h - Class for managing "global" state ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -37,7 +37,9 @@ class StringRef;
class Twine;
namespace yaml {
+
class Output;
+
} // end namespace yaml
/// This is an important class for using LLVM in a threaded context. It
@@ -134,17 +136,17 @@ public:
void enableDebugTypeODRUniquing();
void disableDebugTypeODRUniquing();
- typedef void (*InlineAsmDiagHandlerTy)(const SMDiagnostic&, void *Context,
- unsigned LocCookie);
+ using InlineAsmDiagHandlerTy = void (*)(const SMDiagnostic&, void *Context,
+ unsigned LocCookie);
/// Defines the type of a diagnostic handler.
/// \see LLVMContext::setDiagnosticHandler.
/// \see LLVMContext::diagnose.
- typedef void (*DiagnosticHandlerTy)(const DiagnosticInfo &DI, void *Context);
+ using DiagnosticHandlerTy = void (*)(const DiagnosticInfo &DI, void *Context);
/// Defines the type of a yield callback.
/// \see LLVMContext::setYieldCallback.
- typedef void (*YieldCallbackTy)(LLVMContext *Context, void *OpaqueHandle);
+ using YieldCallbackTy = void (*)(LLVMContext *Context, void *OpaqueHandle);
/// setInlineAsmDiagnosticHandler - This method sets a handler that is invoked
/// when problems with inline asm are detected by the backend. The first
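The handler types in this file show the pattern applied throughout the rest of the import: typedef becomes using with identical meaning, but the aliased type now sits to the right of the '=', which reads more naturally for function-pointer types. A generic illustration (all names invented, not from the patch); the alias template at the end is the one capability typedef lacks:

    #include <vector>
    class DiagInfo; // stand-in class, for illustration only

    // Old and new spellings of the same function-pointer alias:
    typedef void (*OldHandlerTy)(const DiagInfo &DI, void *Context);
    using NewHandlerTy = void (*)(const DiagInfo &DI, void *Context);

    // Alias templates can only be expressed with 'using':
    template <typename T> using PtrVector = std::vector<T *>;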
diff --git a/include/llvm/IR/LegacyPassManager.h b/include/llvm/IR/LegacyPassManager.h
index 5257a0eed488..9a376a151505 100644
--- a/include/llvm/IR/LegacyPassManager.h
+++ b/include/llvm/IR/LegacyPassManager.h
@@ -98,6 +98,9 @@ private:
// Create wrappers for C Binding types (see CBindingWrapping.h).
DEFINE_STDCXX_CONVERSION_FUNCTIONS(legacy::PassManagerBase, LLVMPassManagerRef)
+/// If -time-passes has been specified, report the timings immediately and then
+/// reset the timers to zero.
+void reportAndResetTimings();
} // End llvm namespace
#endif
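A sketch of the intended call pattern for the new reportAndResetTimings() hook in a driver that runs more than one legacy pipeline in a single process; everything here other than the function itself is illustrative:

    #include "llvm/IR/LegacyPassManager.h"

    void runTwice(llvm::legacy::PassManager &PM, llvm::Module &First,
                  llvm::Module &Second) {
      PM.run(First);
      llvm::reportAndResetTimings(); // print -time-passes data, zero the timers
      PM.run(Second);                // timings now attribute to this run only
    }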
diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h
index 67c35cd22b34..3024d9e27a2f 100644
--- a/include/llvm/IR/Module.h
+++ b/include/llvm/IR/Module.h
@@ -1,4 +1,4 @@
-//===-- llvm/Module.h - C++ class to represent a VM module ------*- C++ -*-===//
+//===- llvm/Module.h - C++ class to represent a VM module -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,6 +16,10 @@
#define LLVM_IR_MODULE_H
#include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
@@ -23,20 +27,27 @@
#include "llvm/IR/GlobalIFunc.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/SymbolTableListTraits.h"
#include "llvm/Support/CBindingWrapping.h"
#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm-c/Types.h"
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <vector>
namespace llvm {
-template <typename T> class Optional;
+
class Error;
class FunctionType;
class GVMaterializer;
class LLVMContext;
class MemoryBuffer;
class RandomNumberGenerator;
-class StructType;
template <class PtrType> class SmallPtrSetImpl;
+class StructType;
/// A Module instance is used to store all the information related to an
/// LLVM module. Modules are the top level container of all other LLVM
@@ -54,47 +65,47 @@ class Module {
/// @{
public:
/// The type for the list of global variables.
- typedef SymbolTableList<GlobalVariable> GlobalListType;
+ using GlobalListType = SymbolTableList<GlobalVariable>;
/// The type for the list of functions.
- typedef SymbolTableList<Function> FunctionListType;
+ using FunctionListType = SymbolTableList<Function>;
/// The type for the list of aliases.
- typedef SymbolTableList<GlobalAlias> AliasListType;
+ using AliasListType = SymbolTableList<GlobalAlias>;
/// The type for the list of ifuncs.
- typedef SymbolTableList<GlobalIFunc> IFuncListType;
+ using IFuncListType = SymbolTableList<GlobalIFunc>;
/// The type for the list of named metadata.
- typedef ilist<NamedMDNode> NamedMDListType;
+ using NamedMDListType = ilist<NamedMDNode>;
/// The type of the comdat "symbol" table.
- typedef StringMap<Comdat> ComdatSymTabType;
+ using ComdatSymTabType = StringMap<Comdat>;
/// The Global Variable iterator.
- typedef GlobalListType::iterator global_iterator;
+ using global_iterator = GlobalListType::iterator;
/// The Global Variable constant iterator.
- typedef GlobalListType::const_iterator const_global_iterator;
+ using const_global_iterator = GlobalListType::const_iterator;
/// The Function iterators.
- typedef FunctionListType::iterator iterator;
+ using iterator = FunctionListType::iterator;
/// The Function constant iterator
- typedef FunctionListType::const_iterator const_iterator;
+ using const_iterator = FunctionListType::const_iterator;
/// The Function reverse iterator.
- typedef FunctionListType::reverse_iterator reverse_iterator;
+ using reverse_iterator = FunctionListType::reverse_iterator;
/// The Function constant reverse iterator.
- typedef FunctionListType::const_reverse_iterator const_reverse_iterator;
+ using const_reverse_iterator = FunctionListType::const_reverse_iterator;
/// The Global Alias iterators.
- typedef AliasListType::iterator alias_iterator;
+ using alias_iterator = AliasListType::iterator;
/// The Global Alias constant iterator
- typedef AliasListType::const_iterator const_alias_iterator;
+ using const_alias_iterator = AliasListType::const_iterator;
/// The Global IFunc iterators.
- typedef IFuncListType::iterator ifunc_iterator;
+ using ifunc_iterator = IFuncListType::iterator;
/// The Global IFunc constant iterator
- typedef IFuncListType::const_iterator const_ifunc_iterator;
+ using const_ifunc_iterator = IFuncListType::const_iterator;
/// The named metadata iterators.
- typedef NamedMDListType::iterator named_metadata_iterator;
+ using named_metadata_iterator = NamedMDListType::iterator;
/// The named metadata constant iterators.
- typedef NamedMDListType::const_iterator const_named_metadata_iterator;
+ using const_named_metadata_iterator = NamedMDListType::const_iterator;
/// This enumeration defines the supported behaviors of module flags.
enum ModFlagBehavior {
@@ -141,6 +152,7 @@ public:
ModFlagBehavior Behavior;
MDString *Key;
Metadata *Val;
+
ModuleFlagEntry(ModFlagBehavior B, MDString *K, Metadata *V)
: Behavior(B), Key(K), Val(V) {}
};
@@ -483,9 +495,11 @@ public:
const GlobalListType &getGlobalList() const { return GlobalList; }
/// Get the Module's list of global variables.
GlobalListType &getGlobalList() { return GlobalList; }
+
static GlobalListType Module::*getSublistAccess(GlobalVariable*) {
return &Module::GlobalList;
}
+
/// Get the Module's list of functions (constant).
const FunctionListType &getFunctionList() const { return FunctionList; }
/// Get the Module's list of functions.
@@ -493,31 +507,39 @@ public:
static FunctionListType Module::*getSublistAccess(Function*) {
return &Module::FunctionList;
}
+
/// Get the Module's list of aliases (constant).
const AliasListType &getAliasList() const { return AliasList; }
/// Get the Module's list of aliases.
AliasListType &getAliasList() { return AliasList; }
+
static AliasListType Module::*getSublistAccess(GlobalAlias*) {
return &Module::AliasList;
}
+
/// Get the Module's list of ifuncs (constant).
const IFuncListType &getIFuncList() const { return IFuncList; }
/// Get the Module's list of ifuncs.
IFuncListType &getIFuncList() { return IFuncList; }
+
static IFuncListType Module::*getSublistAccess(GlobalIFunc*) {
return &Module::IFuncList;
}
+
/// Get the Module's list of named metadata (constant).
const NamedMDListType &getNamedMDList() const { return NamedMDList; }
/// Get the Module's list of named metadata.
NamedMDListType &getNamedMDList() { return NamedMDList; }
+
static NamedMDListType Module::*getSublistAccess(NamedMDNode*) {
return &Module::NamedMDList;
}
+
/// Get the symbol table of global variable and function identifiers
const ValueSymbolTable &getValueSymbolTable() const { return *ValSymTab; }
/// Get the Module's symbol table of global variable and function identifiers.
ValueSymbolTable &getValueSymbolTable() { return *ValSymTab; }
+
/// Get the Module's symbol table for COMDATs (constant).
const ComdatSymTabType &getComdatSymbolTable() const { return ComdatSymTab; }
/// Get the Module's symbol table for COMDATs.
@@ -602,11 +624,11 @@ public:
/// @name Convenience iterators
/// @{
- typedef concat_iterator<GlobalObject, iterator, global_iterator>
- global_object_iterator;
- typedef concat_iterator<const GlobalObject, const_iterator,
- const_global_iterator>
- const_global_object_iterator;
+ using global_object_iterator =
+ concat_iterator<GlobalObject, iterator, global_iterator>;
+ using const_global_object_iterator =
+ concat_iterator<const GlobalObject, const_iterator,
+ const_global_iterator>;
iterator_range<global_object_iterator> global_objects() {
return concat<GlobalObject>(functions(), globals());
@@ -627,13 +649,12 @@ public:
return global_objects().end();
}
- typedef concat_iterator<GlobalValue, iterator, global_iterator,
- alias_iterator, ifunc_iterator>
- global_value_iterator;
- typedef concat_iterator<const GlobalValue, const_iterator,
- const_global_iterator, const_alias_iterator,
- const_ifunc_iterator>
- const_global_value_iterator;
+ using global_value_iterator =
+ concat_iterator<GlobalValue, iterator, global_iterator, alias_iterator,
+ ifunc_iterator>;
+ using const_global_value_iterator =
+ concat_iterator<const GlobalValue, const_iterator, const_global_iterator,
+ const_alias_iterator, const_ifunc_iterator>;
iterator_range<global_value_iterator> global_values() {
return concat<GlobalValue>(functions(), globals(), aliases(), ifuncs());
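These concat_iterator aliases back Module::global_objects() and global_values(), which allow one uniform walk over what would otherwise be separate lists. A sketch with an invented helper name:

    // Visit every function and global variable through a single range.
    void nameAnonymousGlobalObjects(llvm::Module &M) {
      for (llvm::GlobalObject &GO : M.global_objects())
        if (!GO.hasName())
          GO.setName("anon"); // names are auto-uniquified on collision
    }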
@@ -682,28 +703,35 @@ public:
: public std::iterator<std::input_iterator_tag, DICompileUnit *> {
NamedMDNode *CUs;
unsigned Idx;
+
void SkipNoDebugCUs();
+
public:
explicit debug_compile_units_iterator(NamedMDNode *CUs, unsigned Idx)
: CUs(CUs), Idx(Idx) {
SkipNoDebugCUs();
}
+
debug_compile_units_iterator &operator++() {
++Idx;
SkipNoDebugCUs();
return *this;
}
+
debug_compile_units_iterator operator++(int) {
debug_compile_units_iterator T(*this);
++Idx;
return T;
}
+
bool operator==(const debug_compile_units_iterator &I) const {
return Idx == I.Idx;
}
+
bool operator!=(const debug_compile_units_iterator &I) const {
return Idx != I.Idx;
}
+
DICompileUnit *operator*() const;
DICompileUnit *operator->() const;
};
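SkipNoDebugCUs() makes the iterator above transparently skip compile units marked NoDebug; it is consumed through Module::debug_compile_units(). A sketch, with the helper name invented:

    #include "llvm/IR/DebugInfoMetadata.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"

    void printDebugCUFiles(llvm::Module &M) {
      for (llvm::DICompileUnit *CU : M.debug_compile_units())
        llvm::errs() << CU->getFilename() << "\n"; // NoDebug CUs never appear
    }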
@@ -833,6 +861,6 @@ inline Module *unwrap(LLVMModuleProviderRef MP) {
return reinterpret_cast<Module*>(MP);
}
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_IR_MODULE_H
diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h
index 53570bdf16f4..c46c609609e2 100644
--- a/include/llvm/IR/ModuleSummaryIndex.h
+++ b/include/llvm/IR/ModuleSummaryIndex.h
@@ -1,4 +1,4 @@
-//===-- llvm/ModuleSummaryIndex.h - Module Summary Index --------*- C++ -*-===//
+//===- llvm/ModuleSummaryIndex.h - Module Summary Index ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,21 +16,33 @@
#ifndef LLVM_IR_MODULESUMMARYINDEX_H
#define LLVM_IR_MODULESUMMARYINDEX_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Module.h"
-
+#include "llvm/IR/GlobalValue.h"
+#include <algorithm>
#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
namespace llvm {
namespace yaml {
+
template <typename T> struct MappingTraits;
-}
+
+} // end namespace yaml
/// \brief Class to accumulate and hold information about a callee.
struct CalleeInfo {
@@ -47,7 +59,7 @@ struct CalleeInfo {
class GlobalValueSummary;
-typedef std::vector<std::unique_ptr<GlobalValueSummary>> GlobalValueSummaryList;
+using GlobalValueSummaryList = std::vector<std::unique_ptr<GlobalValueSummary>>;
struct GlobalValueSummaryInfo {
/// The GlobalValue corresponding to this summary. This is only used in
@@ -66,19 +78,22 @@ struct GlobalValueSummaryInfo {
/// likely incur less overhead, as the value type is not very small and the size
/// of the map is unknown, resulting in inefficiencies due to repeated
/// insertions and resizing.
-typedef std::map<GlobalValue::GUID, GlobalValueSummaryInfo>
- GlobalValueSummaryMapTy;
+using GlobalValueSummaryMapTy =
+ std::map<GlobalValue::GUID, GlobalValueSummaryInfo>;
/// Struct that holds a reference to a particular GUID in a global value
/// summary.
struct ValueInfo {
const GlobalValueSummaryMapTy::value_type *Ref = nullptr;
+
ValueInfo() = default;
ValueInfo(const GlobalValueSummaryMapTy::value_type *Ref) : Ref(Ref) {}
+
operator bool() const { return Ref; }
GlobalValue::GUID getGUID() const { return Ref->first; }
const GlobalValue *getValue() const { return Ref->second.GV; }
+
ArrayRef<std::unique_ptr<GlobalValueSummary>> getSummaryList() const {
return Ref->second.SummaryList;
}
@@ -88,9 +103,11 @@ template <> struct DenseMapInfo<ValueInfo> {
static inline ValueInfo getEmptyKey() {
return ValueInfo((GlobalValueSummaryMapTy::value_type *)-1);
}
+
static inline ValueInfo getTombstoneKey() {
return ValueInfo((GlobalValueSummaryMapTy::value_type *)-2);
}
+
static bool isEqual(ValueInfo L, ValueInfo R) { return L.Ref == R.Ref; }
static unsigned getHashValue(ValueInfo I) { return (uintptr_t)I.Ref; }
};
@@ -138,7 +155,7 @@ private:
/// This is the hash of the name of the symbol in the original file. It is
/// identical to the GUID for global symbols, but differs for local since the
/// GUID includes the module level id in the hash.
- GlobalValue::GUID OriginalName;
+ GlobalValue::GUID OriginalName = 0;
/// \brief Path of module IR containing value's definition, used to locate
/// module during importing.
@@ -157,7 +174,7 @@ private:
protected:
GlobalValueSummary(SummaryKind K, GVFlags Flags, std::vector<ValueInfo> Refs)
- : Kind(K), Flags(Flags), OriginalName(0), RefEdgeList(std::move(Refs)) {}
+ : Kind(K), Flags(Flags), RefEdgeList(std::move(Refs)) {}
public:
virtual ~GlobalValueSummary() = default;
@@ -242,7 +259,7 @@ public:
class FunctionSummary : public GlobalValueSummary {
public:
/// <CalleeValueInfo, CalleeInfo> call edge pair.
- typedef std::pair<ValueInfo, CalleeInfo> EdgeTy;
+ using EdgeTy = std::pair<ValueInfo, CalleeInfo>;
/// An "identifier" for a virtual function. This contains the type identifier
/// represented as a GUID and the offset from the address point to the virtual
@@ -376,12 +393,15 @@ public:
template <> struct DenseMapInfo<FunctionSummary::VFuncId> {
static FunctionSummary::VFuncId getEmptyKey() { return {0, uint64_t(-1)}; }
+
static FunctionSummary::VFuncId getTombstoneKey() {
return {0, uint64_t(-2)};
}
+
static bool isEqual(FunctionSummary::VFuncId L, FunctionSummary::VFuncId R) {
return L.GUID == R.GUID && L.Offset == R.Offset;
}
+
static unsigned getHashValue(FunctionSummary::VFuncId I) { return I.GUID; }
};
@@ -389,14 +409,17 @@ template <> struct DenseMapInfo<FunctionSummary::ConstVCall> {
static FunctionSummary::ConstVCall getEmptyKey() {
return {{0, uint64_t(-1)}, {}};
}
+
static FunctionSummary::ConstVCall getTombstoneKey() {
return {{0, uint64_t(-2)}, {}};
}
+
static bool isEqual(FunctionSummary::ConstVCall L,
FunctionSummary::ConstVCall R) {
return DenseMapInfo<FunctionSummary::VFuncId>::isEqual(L.VFunc, R.VFunc) &&
L.Args == R.Args;
}
+
static unsigned getHashValue(FunctionSummary::ConstVCall I) {
return I.VFunc.GUID;
}
@@ -477,20 +500,20 @@ struct TypeIdSummary {
};
/// 160 bits SHA1
-typedef std::array<uint32_t, 5> ModuleHash;
+using ModuleHash = std::array<uint32_t, 5>;
/// Type used for iterating through the global value summary map.
-typedef GlobalValueSummaryMapTy::const_iterator const_gvsummary_iterator;
-typedef GlobalValueSummaryMapTy::iterator gvsummary_iterator;
+using const_gvsummary_iterator = GlobalValueSummaryMapTy::const_iterator;
+using gvsummary_iterator = GlobalValueSummaryMapTy::iterator;
/// String table to hold/own module path strings, which additionally holds the
/// module ID assigned to each module during the plugin step, as well as a hash
/// of the module. The StringMap makes a copy of and owns inserted strings.
-typedef StringMap<std::pair<uint64_t, ModuleHash>> ModulePathStringTableTy;
+using ModulePathStringTableTy = StringMap<std::pair<uint64_t, ModuleHash>>;
/// Map of global value GUID to its summary, used to identify values defined in
/// a particular module, and provide efficient access to their summary.
-typedef std::map<GlobalValue::GUID, GlobalValueSummary *> GVSummaryMapTy;
+using GVSummaryMapTy = std::map<GlobalValue::GUID, GlobalValueSummary *>;
/// Class to hold module path string table and global value map,
/// and encapsulate methods for operating on them.
@@ -697,6 +720,6 @@ public:
StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries) const;
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_IR_MODULESUMMARYINDEX_H
diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h
index c845112baa45..d03b7b65f81e 100644
--- a/include/llvm/IR/PassManager.h
+++ b/include/llvm/IR/PassManager.h
@@ -39,8 +39,8 @@
#define LLVM_IR_PASSMANAGER_H
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
@@ -48,9 +48,15 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/TypeName.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/type_traits.h"
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <iterator>
#include <list>
#include <memory>
+#include <tuple>
+#include <type_traits>
+#include <utility>
#include <vector>
namespace llvm {
@@ -469,15 +475,16 @@ public:
}
template <typename PassT> void addPass(PassT Pass) {
- typedef detail::PassModel<IRUnitT, PassT, PreservedAnalyses,
- AnalysisManagerT, ExtraArgTs...>
- PassModelT;
+ using PassModelT =
+ detail::PassModel<IRUnitT, PassT, PreservedAnalyses, AnalysisManagerT,
+ ExtraArgTs...>;
+
Passes.emplace_back(new PassModelT(std::move(Pass)));
}
private:
- typedef detail::PassConcept<IRUnitT, AnalysisManagerT, ExtraArgTs...>
- PassConceptT;
+ using PassConceptT =
+ detail::PassConcept<IRUnitT, AnalysisManagerT, ExtraArgTs...>;
std::vector<std::unique_ptr<PassConceptT>> Passes;
@@ -486,12 +493,14 @@ private:
};
extern template class PassManager<Module>;
+
/// \brief Convenience typedef for a pass manager over modules.
-typedef PassManager<Module> ModulePassManager;
+using ModulePassManager = PassManager<Module>;
extern template class PassManager<Function>;
+
/// \brief Convenience typedef for a pass manager over functions.
-typedef PassManager<Function> FunctionPassManager;
+using FunctionPassManager = PassManager<Function>;
/// \brief A container for analyses that lazily runs them and caches their
/// results.
@@ -504,11 +513,11 @@ public:
private:
// Now that we've defined our invalidator, we can define the concept types.
- typedef detail::AnalysisResultConcept<IRUnitT, PreservedAnalyses, Invalidator>
- ResultConceptT;
- typedef detail::AnalysisPassConcept<IRUnitT, PreservedAnalyses, Invalidator,
- ExtraArgTs...>
- PassConceptT;
+ using ResultConceptT =
+ detail::AnalysisResultConcept<IRUnitT, PreservedAnalyses, Invalidator>;
+ using PassConceptT =
+ detail::AnalysisPassConcept<IRUnitT, PreservedAnalyses, Invalidator,
+ ExtraArgTs...>;
/// \brief List of analysis pass IDs and associated concept pointers.
///
@@ -516,18 +525,18 @@ private:
/// erases. Provides the analysis ID to enable finding iterators to a given
/// entry in maps below, and provides the storage for the actual result
/// concept.
- typedef std::list<std::pair<AnalysisKey *, std::unique_ptr<ResultConceptT>>>
- AnalysisResultListT;
+ using AnalysisResultListT =
+ std::list<std::pair<AnalysisKey *, std::unique_ptr<ResultConceptT>>>;
/// \brief Map type from IRUnitT pointer to our custom list type.
- typedef DenseMap<IRUnitT *, AnalysisResultListT> AnalysisResultListMapT;
+ using AnalysisResultListMapT = DenseMap<IRUnitT *, AnalysisResultListT>;
/// \brief Map type from a pair of analysis ID and IRUnitT pointer to an
/// iterator into a particular result list (which is where the actual analysis
/// result is stored).
- typedef DenseMap<std::pair<AnalysisKey *, IRUnitT *>,
- typename AnalysisResultListT::iterator>
- AnalysisResultMapT;
+ using AnalysisResultMapT =
+ DenseMap<std::pair<AnalysisKey *, IRUnitT *>,
+ typename AnalysisResultListT::iterator>;
public:
/// API to communicate dependencies between analyses during invalidation.
@@ -558,10 +567,10 @@ public:
/// dependencies on it will become invalid as a result.
template <typename PassT>
bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA) {
- typedef detail::AnalysisResultModel<IRUnitT, PassT,
- typename PassT::Result,
- PreservedAnalyses, Invalidator>
- ResultModelT;
+ using ResultModelT =
+ detail::AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
+ PreservedAnalyses, Invalidator>;
+
return invalidateImpl<ResultModelT>(PassT::ID(), IR, PA);
}
@@ -672,9 +681,11 @@ public:
"This analysis pass was not registered prior to being queried");
ResultConceptT &ResultConcept =
getResultImpl(PassT::ID(), IR, ExtraArgs...);
- typedef detail::AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
- PreservedAnalyses, Invalidator>
- ResultModelT;
+
+ using ResultModelT =
+ detail::AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
+ PreservedAnalyses, Invalidator>;
+
return static_cast<ResultModelT &>(ResultConcept).Result;
}
@@ -692,9 +703,10 @@ public:
if (!ResultConcept)
return nullptr;
- typedef detail::AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
- PreservedAnalyses, Invalidator>
- ResultModelT;
+ using ResultModelT =
+ detail::AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
+ PreservedAnalyses, Invalidator>;
+
return &static_cast<ResultModelT *>(ResultConcept)->Result;
}
@@ -717,10 +729,10 @@ public:
/// hashtable.)
template <typename PassBuilderT>
bool registerPass(PassBuilderT &&PassBuilder) {
- typedef decltype(PassBuilder()) PassT;
- typedef detail::AnalysisPassModel<IRUnitT, PassT, PreservedAnalyses,
- Invalidator, ExtraArgTs...>
- PassModelT;
+ using PassT = decltype(PassBuilder());
+ using PassModelT =
+ detail::AnalysisPassModel<IRUnitT, PassT, PreservedAnalyses,
+ Invalidator, ExtraArgTs...>;
auto &PassPtr = AnalysisPasses[PassT::ID()];
if (PassPtr)
@@ -876,7 +888,8 @@ private:
}
/// \brief Map type from module analysis pass ID to pass concept pointer.
- typedef DenseMap<AnalysisKey *, std::unique_ptr<PassConceptT>> AnalysisPassMapT;
+ using AnalysisPassMapT =
+ DenseMap<AnalysisKey *, std::unique_ptr<PassConceptT>>;
/// \brief Collection of module analysis passes, indexed by ID.
AnalysisPassMapT AnalysisPasses;
@@ -896,12 +909,14 @@ private:
};
extern template class AnalysisManager<Module>;
+
/// \brief Convenience typedef for the Module analysis manager.
-typedef AnalysisManager<Module> ModuleAnalysisManager;
+using ModuleAnalysisManager = AnalysisManager<Module>;
extern template class AnalysisManager<Function>;
+
/// \brief Convenience typedef for the Function analysis manager.
-typedef AnalysisManager<Function> FunctionAnalysisManager;
+using FunctionAnalysisManager = AnalysisManager<Function>;
/// \brief An analysis over an "outer" IR unit that provides access to an
/// analysis manager over an "inner" IR unit. The inner unit must be contained
@@ -927,20 +942,14 @@ public:
class Result {
public:
explicit Result(AnalysisManagerT &InnerAM) : InnerAM(&InnerAM) {}
+
Result(Result &&Arg) : InnerAM(std::move(Arg.InnerAM)) {
// We have to null out the analysis manager in the moved-from state
// because we are taking ownership of the responsibility to clear the
// analysis state.
Arg.InnerAM = nullptr;
}
- Result &operator=(Result &&RHS) {
- InnerAM = RHS.InnerAM;
- // We have to null out the analysis manager in the moved-from state
- // because we are taking ownership of the responsibilty to clear the
- // analysis state.
- RHS.InnerAM = nullptr;
- return *this;
- }
+
~Result() {
// InnerAM is cleared in a moved from state where there is nothing to do.
if (!InnerAM)
@@ -951,6 +960,15 @@ public:
InnerAM->clear();
}
+ Result &operator=(Result &&RHS) {
+ InnerAM = RHS.InnerAM;
+ // We have to null out the analysis manager in the moved-from state
+ // because we are taking ownership of the responsibility to clear the
+ // analysis state.
+ RHS.InnerAM = nullptr;
+ return *this;
+ }
+
/// \brief Accessor for the analysis manager.
AnalysisManagerT &getManager() { return *InnerAM; }
@@ -988,6 +1006,7 @@ public:
private:
friend AnalysisInfoMixin<
InnerAnalysisManagerProxy<AnalysisManagerT, IRUnitT>>;
+
static AnalysisKey Key;
AnalysisManagerT *InnerAM;
@@ -998,8 +1017,8 @@ AnalysisKey
InnerAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>::Key;
/// Provide the \c FunctionAnalysisManager to \c Module proxy.
-typedef InnerAnalysisManagerProxy<FunctionAnalysisManager, Module>
- FunctionAnalysisManagerModuleProxy;
+using FunctionAnalysisManagerModuleProxy =
+ InnerAnalysisManagerProxy<FunctionAnalysisManager, Module>;
/// Specialization of the invalidate method for the \c
/// FunctionAnalysisManagerModuleProxy's result.
@@ -1097,6 +1116,7 @@ public:
private:
friend AnalysisInfoMixin<
OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>>;
+
static AnalysisKey Key;
const AnalysisManagerT *AM;
@@ -1109,8 +1129,8 @@ AnalysisKey
extern template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
Function>;
/// Provide the \c ModuleAnalysisManager to \c Function proxy.
-typedef OuterAnalysisManagerProxy<ModuleAnalysisManager, Function>
- ModuleAnalysisManagerFunctionProxy;
+using ModuleAnalysisManagerFunctionProxy =
+ OuterAnalysisManagerProxy<ModuleAnalysisManager, Function>;
/// \brief Trivial adaptor that maps from a module to its functions.
///
@@ -1274,6 +1294,6 @@ RepeatedPass<PassT> createRepeatedPass(int Count, PassT P) {
return RepeatedPass<PassT>(Count, std::move(P));
}
-}
+} // end namespace llvm
-#endif
+#endif // LLVM_IR_PASSMANAGER_H
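Taken together, the aliases and the addPass/registerPass templates above give the usual new-pass-manager flow. A sketch under the assumption that every analysis the pipeline needs has been registered (a real driver typically delegates registration to PassBuilder::registerFunctionAnalyses); the concrete pass and analysis names are examples, not part of this header:

    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Scalar/SimplifyCFG.h"

    llvm::PreservedAnalyses runSketch(llvm::Function &F) {
      llvm::FunctionAnalysisManager FAM;
      FAM.registerPass([] { return llvm::DominatorTreeAnalysis(); });
      // ... register the remaining analyses the pipeline depends on ...
      llvm::FunctionPassManager FPM;
      FPM.addPass(llvm::SimplifyCFGPass());
      return FPM.run(F, FAM); // runs the pipeline, reports what it preserved
    }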
diff --git a/include/llvm/IR/PassManagerInternal.h b/include/llvm/IR/PassManagerInternal.h
index 387dc4c65c43..9195d4dfa428 100644
--- a/include/llvm/IR/PassManagerInternal.h
+++ b/include/llvm/IR/PassManagerInternal.h
@@ -27,7 +27,6 @@ namespace llvm {
template <typename IRUnitT> class AllAnalysesOn;
template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager;
-class Invalidator;
class PreservedAnalyses;
/// \brief Implementation details of the pass manager interfaces.
@@ -116,7 +115,7 @@ struct AnalysisResultConcept {
/// \brief SFINAE metafunction for computing whether \c ResultT provides an
/// \c invalidate member function.
template <typename IRUnitT, typename ResultT> class ResultHasInvalidateMethod {
- typedef char EnabledType;
+ using EnabledType = char;
struct DisabledType {
char a, b;
};
@@ -124,7 +123,7 @@ template <typename IRUnitT, typename ResultT> class ResultHasInvalidateMethod {
// Purely to help out MSVC which fails to disable the below specialization,
// explicitly enable using the result type's invalidate routine if we can
// successfully call that routine.
- template <typename T> struct Nonce { typedef EnabledType Type; };
+ template <typename T> struct Nonce { using Type = EnabledType; };
template <typename T>
static typename Nonce<decltype(std::declval<T>().invalidate(
std::declval<IRUnitT &>(), std::declval<PreservedAnalyses>()))>::Type
@@ -280,9 +279,9 @@ struct AnalysisPassModel : AnalysisPassConcept<IRUnitT, PreservedAnalysesT,
}
// FIXME: Replace PassT::Result with type traits when we use C++11.
- typedef AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
- PreservedAnalysesT, InvalidatorT>
- ResultModelT;
+ using ResultModelT =
+ AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
+ PreservedAnalysesT, InvalidatorT>;
/// \brief The model delegates to the \c PassT::run method.
///
diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h
index 31a76b4ed6c3..6b2b22e82b95 100644
--- a/include/llvm/IR/PatternMatch.h
+++ b/include/llvm/IR/PatternMatch.h
@@ -29,11 +29,19 @@
#ifndef LLVM_IR_PATTERNMATCH_H
#define LLVM_IR_PATTERNMATCH_H
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <cstdint>
namespace llvm {
namespace PatternMatch {
@@ -172,7 +180,9 @@ inline match_nan m_NaN() { return match_nan(); }
struct apint_match {
const APInt *&Res;
+
apint_match(const APInt *&R) : Res(R) {}
+
template <typename ITy> bool match(ITy *V) {
if (auto *CI = dyn_cast<ConstantInt>(V)) {
Res = &CI->getValue();
@@ -230,7 +240,9 @@ template <typename Predicate> struct cst_pred_ty : public Predicate {
/// satisfy a specified predicate, and bind them to an APInt.
template <typename Predicate> struct api_pred_ty : public Predicate {
const APInt *&Res;
+
api_pred_ty(const APInt *&R) : Res(R) {}
+
template <typename ITy> bool match(ITy *V) {
if (const auto *CI = dyn_cast<ConstantInt>(V))
if (this->isValue(CI->getValue())) {
@@ -294,6 +306,7 @@ inline api_pred_ty<is_maxsignedvalue> m_MaxSignedValue(const APInt *&V) { return
template <typename Class> struct bind_ty {
Class *&VR;
+
bind_ty(Class *&V) : VR(V) {}
template <typename ITy> bool match(ITy *V) {
@@ -326,6 +339,7 @@ inline bind_ty<ConstantFP> m_ConstantFP(ConstantFP *&C) { return C; }
/// \brief Match a specified Value*.
struct specificval_ty {
const Value *Val;
+
specificval_ty(const Value *V) : Val(V) {}
template <typename ITy> bool match(ITy *V) { return V == Val; }
@@ -338,6 +352,7 @@ inline specificval_ty m_Specific(const Value *V) { return V; }
/// that value.
struct specific_fpval {
double Val;
+
specific_fpval(double V) : Val(V) {}
template <typename ITy> bool match(ITy *V) {
@@ -360,6 +375,7 @@ inline specific_fpval m_FPOne() { return m_SpecificFP(1.0); }
struct bind_const_intval_ty {
uint64_t &VR;
+
bind_const_intval_ty(uint64_t &V) : VR(V) {}
template <typename ITy> bool match(ITy *V) {
@@ -376,6 +392,7 @@ struct bind_const_intval_ty {
// value.
struct specific_intval {
uint64_t Val;
+
specific_intval(uint64_t V) : Val(V) {}
template <typename ITy> bool match(ITy *V) {
@@ -939,6 +956,7 @@ template <typename LHS> inline fneg_match<LHS> m_FNeg(const LHS &L) {
struct br_match {
BasicBlock *&Succ;
+
br_match(BasicBlock *&Succ) : Succ(Succ) {}
template <typename OpTy> bool match(OpTy *V) {
@@ -956,6 +974,7 @@ inline br_match m_UnconditionalBr(BasicBlock *&Succ) { return br_match(Succ); }
template <typename Cond_t> struct brc_match {
Cond_t Cond;
BasicBlock *&T, *&F;
+
brc_match(const Cond_t &C, BasicBlock *&t, BasicBlock *&f)
: Cond(C), T(t), F(f) {}
@@ -1202,6 +1221,7 @@ m_UnordFMin(const LHS &L, const RHS &R) {
template <typename Opnd_t> struct Argument_match {
unsigned OpI;
Opnd_t Val;
+
Argument_match(unsigned OpIdx, const Opnd_t &V) : OpI(OpIdx), Val(V) {}
template <typename OpTy> bool match(OpTy *V) {
@@ -1219,6 +1239,7 @@ inline Argument_match<Opnd_t> m_Argument(const Opnd_t &Op) {
/// \brief Intrinsic matchers.
struct IntrinsicID_match {
unsigned ID;
+
IntrinsicID_match(Intrinsic::ID IntrID) : ID(IntrID) {}
template <typename OpTy> bool match(OpTy *V) {
@@ -1239,21 +1260,23 @@ template <typename T0 = void, typename T1 = void, typename T2 = void,
typename T9 = void, typename T10 = void>
struct m_Intrinsic_Ty;
template <typename T0> struct m_Intrinsic_Ty<T0> {
- typedef match_combine_and<IntrinsicID_match, Argument_match<T0>> Ty;
+ using Ty = match_combine_and<IntrinsicID_match, Argument_match<T0>>;
};
template <typename T0, typename T1> struct m_Intrinsic_Ty<T0, T1> {
- typedef match_combine_and<typename m_Intrinsic_Ty<T0>::Ty, Argument_match<T1>>
- Ty;
+ using Ty =
+ match_combine_and<typename m_Intrinsic_Ty<T0>::Ty, Argument_match<T1>>;
};
template <typename T0, typename T1, typename T2>
struct m_Intrinsic_Ty<T0, T1, T2> {
- typedef match_combine_and<typename m_Intrinsic_Ty<T0, T1>::Ty,
- Argument_match<T2>> Ty;
+ using Ty =
+ match_combine_and<typename m_Intrinsic_Ty<T0, T1>::Ty,
+ Argument_match<T2>>;
};
template <typename T0, typename T1, typename T2, typename T3>
struct m_Intrinsic_Ty<T0, T1, T2, T3> {
- typedef match_combine_and<typename m_Intrinsic_Ty<T0, T1, T2>::Ty,
- Argument_match<T3>> Ty;
+ using Ty =
+ match_combine_and<typename m_Intrinsic_Ty<T0, T1, T2>::Ty,
+ Argument_match<T3>>;
};
/// \brief Match intrinsic calls like this:
@@ -1437,4 +1460,4 @@ m_c_UMax(const LHS &L, const RHS &R) {
} // end namespace PatternMatch
} // end namespace llvm
-#endif
+#endif // LLVM_IR_PATTERNMATCH_H
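The matcher structs and m_Intrinsic_Ty aliases touched above compose through PatternMatch::match(). A brief sketch; the two predicate functions are invented for illustration:

    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    bool isAddOfConstant(Value *V) {
      Value *X;
      const APInt *C;
      return match(V, m_Add(m_Value(X), m_APInt(C))); // binds X and C on success
    }

    bool isBswapCall(Value *V) {
      Value *X;
      return match(V, m_Intrinsic<Intrinsic::bswap>(m_Value(X)));
    }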
diff --git a/include/llvm/IR/ProfileSummary.h b/include/llvm/IR/ProfileSummary.h
index f4248014c6e1..d85ce8c443ec 100644
--- a/include/llvm/IR/ProfileSummary.h
+++ b/include/llvm/IR/ProfileSummary.h
@@ -1,4 +1,4 @@
-//===-- ProfileSummary.h - Profile summary data structure. ------*- C++ -*-===//
+//===- ProfileSummary.h - Profile summary data structure. -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,21 +11,17 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_SUPPORT_PROFILE_SUMMARY_H
-#define LLVM_SUPPORT_PROFILE_SUMMARY_H
+#ifndef LLVM_IR_PROFILESUMMARY_H
+#define LLVM_IR_PROFILESUMMARY_H
+#include <algorithm>
#include <cstdint>
-#include <utility>
#include <vector>
-#include "llvm/Support/Casting.h"
-
namespace llvm {
class LLVMContext;
class Metadata;
-class MDTuple;
-class MDNode;
// The profile summary is one or more (Cutoff, MinCount, NumCounts) triplets.
// The semantics of counts depend on the type of profile. For instrumentation
@@ -37,12 +33,13 @@ struct ProfileSummaryEntry {
uint32_t Cutoff; ///< The required percentile of counts.
uint64_t MinCount; ///< The minimum count for this percentile.
uint64_t NumCounts; ///< Number of counts >= the minimum count.
+
ProfileSummaryEntry(uint32_t TheCutoff, uint64_t TheMinCount,
uint64_t TheNumCounts)
: Cutoff(TheCutoff), MinCount(TheMinCount), NumCounts(TheNumCounts) {}
};
-typedef std::vector<ProfileSummaryEntry> SummaryEntryVector;
+using SummaryEntryVector = std::vector<ProfileSummaryEntry>;
class ProfileSummary {
public:
@@ -59,6 +56,7 @@ private:
public:
static const int Scale = 1000000;
+
ProfileSummary(Kind K, SummaryEntryVector DetailedSummary,
uint64_t TotalCount, uint64_t MaxCount,
uint64_t MaxInternalCount, uint64_t MaxFunctionCount,
@@ -67,6 +65,7 @@ public:
TotalCount(TotalCount), MaxCount(MaxCount),
MaxInternalCount(MaxInternalCount), MaxFunctionCount(MaxFunctionCount),
NumCounts(NumCounts), NumFunctions(NumFunctions) {}
+
Kind getKind() const { return PSK; }
/// \brief Return summary information as metadata.
Metadata *getMD(LLVMContext &Context);
@@ -82,4 +81,5 @@ public:
};
} // end namespace llvm
-#endif
+
+#endif // LLVM_IR_PROFILESUMMARY_H
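A sketch of assembling a summary with the entry type and constructor shown above; every count is invented for illustration, and the helper name is hypothetical:

    #include "llvm/IR/ProfileSummary.h"

    llvm::Metadata *makeToySummary(llvm::LLVMContext &Ctx) {
      llvm::SummaryEntryVector Entries;
      Entries.emplace_back(/*Cutoff=*/990000, /*MinCount=*/120, /*NumCounts=*/4);
      llvm::ProfileSummary PS(llvm::ProfileSummary::PSK_Instr,
                              std::move(Entries),
                              /*TotalCount=*/10000, /*MaxCount=*/1200,
                              /*MaxInternalCount=*/900,
                              /*MaxFunctionCount=*/1200,
                              /*NumCounts=*/64, /*NumFunctions=*/8);
      return PS.getMD(Ctx); // ready to attach as module-level metadata
    }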
diff --git a/include/llvm/IR/Statepoint.h b/include/llvm/IR/Statepoint.h
index 03151cd7c8f7..f01607614a0c 100644
--- a/include/llvm/IR/Statepoint.h
+++ b/include/llvm/IR/Statepoint.h
@@ -1,4 +1,4 @@
-//===-- llvm/IR/Statepoint.h - gc.statepoint utilities ----------*- C++ -*-===//
+//===- llvm/IR/Statepoint.h - gc.statepoint utilities -----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -24,10 +24,12 @@
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
@@ -87,7 +89,7 @@ protected:
}
public:
- typedef typename CallSiteTy::arg_iterator arg_iterator;
+ using arg_iterator = typename CallSiteTy::arg_iterator;
enum {
IDPos = 0,
@@ -300,8 +302,9 @@ public:
class ImmutableStatepoint
: public StatepointBase<const Function, const Instruction, const Value,
ImmutableCallSite> {
- typedef StatepointBase<const Function, const Instruction, const Value,
- ImmutableCallSite> Base;
+ using Base =
+ StatepointBase<const Function, const Instruction, const Value,
+ ImmutableCallSite>;
public:
explicit ImmutableStatepoint(const Instruction *I) : Base(I) {}
@@ -312,7 +315,7 @@ public:
/// to a gc.statepoint.
class Statepoint
: public StatepointBase<Function, Instruction, Value, CallSite> {
- typedef StatepointBase<Function, Instruction, Value, CallSite> Base;
+ using Base = StatepointBase<Function, Instruction, Value, CallSite>;
public:
explicit Statepoint(Instruction *I) : Base(I) {}
@@ -327,6 +330,7 @@ public:
return I->getIntrinsicID() == Intrinsic::experimental_gc_relocate ||
I->getIntrinsicID() == Intrinsic::experimental_gc_result;
}
+
static inline bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
@@ -369,6 +373,7 @@ public:
static inline bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::experimental_gc_relocate;
}
+
static inline bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
@@ -403,6 +408,7 @@ public:
static inline bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::experimental_gc_result;
}
+
static inline bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
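The classof additions above are what route these wrapper classes through isa/dyn_cast. A sketch with an invented helper name:

    #include "llvm/IR/Statepoint.h"

    void inspectRelocate(llvm::Instruction &I) {
      if (auto *GRI = llvm::dyn_cast<llvm::GCRelocateInst>(&I)) {
        llvm::Value *Base = GRI->getBasePtr();       // relocated base pointer
        llvm::Value *Derived = GRI->getDerivedPtr(); // relocated derived pointer
        (void)Base; (void)Derived;
      }
    }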
diff --git a/include/llvm/IR/SymbolTableListTraits.h b/include/llvm/IR/SymbolTableListTraits.h
index 49a5fb21297d..87ce902c2811 100644
--- a/include/llvm/IR/SymbolTableListTraits.h
+++ b/include/llvm/IR/SymbolTableListTraits.h
@@ -48,7 +48,7 @@ class ValueSymbolTable;
template <typename NodeTy> struct SymbolTableListParentType {};
#define DEFINE_SYMBOL_TABLE_PARENT_TYPE(NODE, PARENT) \
- template <> struct SymbolTableListParentType<NODE> { typedef PARENT type; };
+ template <> struct SymbolTableListParentType<NODE> { using type = PARENT; };
DEFINE_SYMBOL_TABLE_PARENT_TYPE(Instruction, BasicBlock)
DEFINE_SYMBOL_TABLE_PARENT_TYPE(BasicBlock, Function)
DEFINE_SYMBOL_TABLE_PARENT_TYPE(Argument, Function)
@@ -65,10 +65,10 @@ template <typename NodeTy> class SymbolTableList;
//
template <typename ValueSubClass>
class SymbolTableListTraits : public ilist_alloc_traits<ValueSubClass> {
- typedef SymbolTableList<ValueSubClass> ListTy;
- typedef typename simple_ilist<ValueSubClass>::iterator iterator;
- typedef
- typename SymbolTableListParentType<ValueSubClass>::type ItemParentClass;
+ using ListTy = SymbolTableList<ValueSubClass>;
+ using iterator = typename simple_ilist<ValueSubClass>::iterator;
+ using ItemParentClass =
+ typename SymbolTableListParentType<ValueSubClass>::type;
public:
SymbolTableListTraits() = default;
diff --git a/include/llvm/IR/TrackingMDRef.h b/include/llvm/IR/TrackingMDRef.h
index 12b196432006..bdec904ad1e1 100644
--- a/include/llvm/IR/TrackingMDRef.h
+++ b/include/llvm/IR/TrackingMDRef.h
@@ -139,31 +139,35 @@ public:
bool hasTrivialDestructor() const { return Ref.hasTrivialDestructor(); }
};
-typedef TypedTrackingMDRef<MDNode> TrackingMDNodeRef;
-typedef TypedTrackingMDRef<ValueAsMetadata> TrackingValueAsMetadataRef;
+using TrackingMDNodeRef = TypedTrackingMDRef<MDNode>;
+using TrackingValueAsMetadataRef = TypedTrackingMDRef<ValueAsMetadata>;
// Expose the underlying metadata to casting.
template <> struct simplify_type<TrackingMDRef> {
- typedef Metadata *SimpleType;
+ using SimpleType = Metadata *;
+
static SimpleType getSimplifiedValue(TrackingMDRef &MD) { return MD.get(); }
};
template <> struct simplify_type<const TrackingMDRef> {
- typedef Metadata *SimpleType;
+ using SimpleType = Metadata *;
+
static SimpleType getSimplifiedValue(const TrackingMDRef &MD) {
return MD.get();
}
};
template <class T> struct simplify_type<TypedTrackingMDRef<T>> {
- typedef T *SimpleType;
+ using SimpleType = T *;
+
static SimpleType getSimplifiedValue(TypedTrackingMDRef<T> &MD) {
return MD.get();
}
};
template <class T> struct simplify_type<const TypedTrackingMDRef<T>> {
- typedef T *SimpleType;
+ using SimpleType = T *;
+
static SimpleType getSimplifiedValue(const TypedTrackingMDRef<T> &MD) {
return MD.get();
}
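As the comment above says, these simplify_type specializations expose the wrapped metadata to the casting machinery, so a tracking reference can be queried without an explicit get(). A sketch, assuming N is a non-null MDNode; the function name is invented:

    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/TrackingMDRef.h"

    bool refersToTuple(llvm::MDNode *N) {
      llvm::TrackingMDNodeRef Ref(N);
      return llvm::isa<llvm::MDTuple>(Ref); // handle is looked through
    }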
diff --git a/include/llvm/IR/Type.h b/include/llvm/IR/Type.h
index e6a0df937e9b..82362107e41e 100644
--- a/include/llvm/IR/Type.h
+++ b/include/llvm/IR/Type.h
@@ -1,4 +1,4 @@
-//===-- llvm/Type.h - Classes for handling data types -----------*- C++ -*-===//
+//===- llvm/Type.h - Classes for handling data types ------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,21 +18,22 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Support/CBindingWrapping.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/CBindingWrapping.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
namespace llvm {
-class PointerType;
+template<class GraphType> struct GraphTraits;
class IntegerType;
-class raw_ostream;
-class Module;
class LLVMContext;
-class LLVMContextImpl;
+class PointerType;
+class raw_ostream;
class StringRef;
-template<class GraphType> struct GraphTraits;
/// The instances of the Type class are immutable: once they are created,
/// they are never changed. Also note that only one instance of a particular
@@ -86,9 +87,9 @@ private:
protected:
friend class LLVMContextImpl;
+
explicit Type(LLVMContext &C, TypeID tid)
- : Context(C), ID(tid), SubclassData(0),
- NumContainedTys(0), ContainedTys(nullptr) {}
+ : Context(C), ID(tid), SubclassData(0) {}
~Type() = default;
unsigned getSubclassData() const { return SubclassData; }
@@ -100,14 +101,14 @@ protected:
}
/// Keeps track of how many Type*'s there are in the ContainedTys list.
- unsigned NumContainedTys;
+ unsigned NumContainedTys = 0;
/// A pointer to the array of Types contained by this Type. For example, this
/// includes the arguments of a function type, the elements of a structure,
/// the pointee of a pointer, the element type of an array, etc. This pointer
/// may be 0 for types that don't contain other types (Integer, Double,
/// Float).
- Type * const *ContainedTys;
+ Type * const *ContainedTys = nullptr;
static bool isSequentialType(TypeID TyID) {
return TyID == ArrayTyID || TyID == VectorTyID;
@@ -122,6 +123,7 @@ public:
/// inlined with the operands when printing an instruction.
void print(raw_ostream &O, bool IsForDebug = false,
bool NoDetails = false) const;
+
void dump() const;
/// Return the LLVMContext in which this type was uniqued.
@@ -299,14 +301,16 @@ public:
//===--------------------------------------------------------------------===//
// Type Iteration support.
//
- typedef Type * const *subtype_iterator;
+ using subtype_iterator = Type * const *;
+
subtype_iterator subtype_begin() const { return ContainedTys; }
subtype_iterator subtype_end() const { return &ContainedTys[NumContainedTys];}
ArrayRef<Type*> subtypes() const {
return makeArrayRef(subtype_begin(), subtype_end());
}
- typedef std::reverse_iterator<subtype_iterator> subtype_reverse_iterator;
+ using subtype_reverse_iterator = std::reverse_iterator<subtype_iterator>;
+
subtype_reverse_iterator subtype_rbegin() const {
return subtype_reverse_iterator(subtype_end());
}
@@ -348,6 +352,7 @@ public:
}
inline uint64_t getArrayNumElements() const;
+
Type *getArrayElementType() const {
assert(getTypeID() == ArrayTyID);
return ContainedTys[0];
@@ -444,8 +449,8 @@ template <> struct isa_impl<PointerType, Type> {
// graph of sub types.
template <> struct GraphTraits<Type *> {
- typedef Type *NodeRef;
- typedef Type::subtype_iterator ChildIteratorType;
+ using NodeRef = Type *;
+ using ChildIteratorType = Type::subtype_iterator;
static NodeRef getEntryNode(Type *T) { return T; }
static ChildIteratorType child_begin(NodeRef N) { return N->subtype_begin(); }
@@ -453,8 +458,8 @@ template <> struct GraphTraits<Type *> {
};
template <> struct GraphTraits<const Type*> {
- typedef const Type *NodeRef;
- typedef Type::subtype_iterator ChildIteratorType;
+ using NodeRef = const Type *;
+ using ChildIteratorType = Type::subtype_iterator;
static NodeRef getEntryNode(NodeRef T) { return T; }
static ChildIteratorType child_begin(NodeRef N) { return N->subtype_begin(); }
@@ -474,6 +479,6 @@ inline LLVMTypeRef *wrap(Type **Tys) {
return reinterpret_cast<LLVMTypeRef*>(const_cast<Type**>(Tys));
}
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_IR_TYPE_H
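The GraphTraits specializations above plug Type into LLVM's generic graph algorithms. A sketch of a depth-first walk over a type and everything it contains; the helper name is invented:

    #include "llvm/ADT/DepthFirstIterator.h"
    #include "llvm/IR/Type.h"

    void dumpContainedTypes(llvm::Type *RootTy) {
      for (llvm::Type *SubTy : llvm::depth_first(RootTy))
        SubTy->dump(); // RootTy first, then its contained types, transitively
    }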
diff --git a/include/llvm/IR/TypeFinder.h b/include/llvm/IR/TypeFinder.h
index 48c4f1161aa1..c050c388d398 100644
--- a/include/llvm/IR/TypeFinder.h
+++ b/include/llvm/IR/TypeFinder.h
@@ -44,8 +44,8 @@ public:
void run(const Module &M, bool onlyNamed);
void clear();
- typedef std::vector<StructType*>::iterator iterator;
- typedef std::vector<StructType*>::const_iterator const_iterator;
+ using iterator = std::vector<StructType*>::iterator;
+ using const_iterator = std::vector<StructType*>::const_iterator;
iterator begin() { return StructTypes.begin(); }
iterator end() { return StructTypes.end(); }
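With the iterator aliases above, TypeFinder is directly range-iterable. A sketch with an invented helper name:

    #include "llvm/IR/TypeFinder.h"
    #include "llvm/Support/raw_ostream.h"

    void printNamedStructs(const llvm::Module &M) {
      llvm::TypeFinder Finder;
      Finder.run(M, /*onlyNamed=*/true);
      for (llvm::StructType *ST : Finder)
        llvm::errs() << ST->getName() << "\n"; // each named struct type in M
    }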
diff --git a/include/llvm/IR/Use.h b/include/llvm/IR/Use.h
index 6b56546f4421..d3a59d8a060e 100644
--- a/include/llvm/IR/Use.h
+++ b/include/llvm/IR/Use.h
@@ -1,4 +1,4 @@
-//===-- llvm/Use.h - Definition of the Use class ----------------*- C++ -*-===//
+//===- llvm/Use.h - Definition of the Use class -----------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -27,14 +27,14 @@
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/Support/CBindingWrapping.h"
+#include "llvm/Support/Compiler.h"
#include "llvm-c/Types.h"
namespace llvm {
-class Value;
-class User;
-class Use;
template <typename> struct simplify_type;
+class User;
+class Value;
/// \brief A Use represents the edge between a Value definition and its users.
///
@@ -65,23 +65,27 @@ public:
/// use the LSB regardless of pointer alignment on different targets.
struct UserRefPointerTraits {
static inline void *getAsVoidPointer(User *P) { return P; }
+
static inline User *getFromVoidPointer(void *P) {
return (User *)P;
}
+
enum { NumLowBitsAvailable = 1 };
};
// A type for the word following an array of hung-off Uses in memory, which is
// a pointer back to their User with the bottom bit set.
- typedef PointerIntPair<User *, 1, unsigned, UserRefPointerTraits> UserRef;
+ using UserRef = PointerIntPair<User *, 1, unsigned, UserRefPointerTraits>;
/// Pointer traits for the Prev PointerIntPair. This ensures we always use
/// the two LSBs regardless of pointer alignment on different targets.
struct PrevPointerTraits {
static inline void *getAsVoidPointer(Use **P) { return P; }
+
static inline Use **getFromVoidPointer(void *P) {
return (Use **)P;
}
+
enum { NumLowBitsAvailable = 2 };
};
@@ -95,9 +99,11 @@ private:
enum PrevPtrTag { zeroDigitTag, oneDigitTag, stopTag, fullStopTag };
/// Constructor
- Use(PrevPtrTag tag) : Val(nullptr) { Prev.setInt(tag); }
+ Use(PrevPtrTag tag) { Prev.setInt(tag); }
public:
+ friend class Value;
+
operator Value *() const { return Val; }
Value *get() const { return Val; }
@@ -133,7 +139,7 @@ public:
private:
const Use *getImpliedUser() const LLVM_READONLY;
- Value *Val;
+ Value *Val = nullptr;
Use *Next;
PointerIntPair<Use **, 2, PrevPtrTag, PrevPointerTraits> Prev;
@@ -153,18 +159,18 @@ private:
if (Next)
Next->setPrev(StrippedPrev);
}
-
- friend class Value;
};
/// \brief Allow clients to treat uses just like values when using
/// casting operators.
template <> struct simplify_type<Use> {
- typedef Value *SimpleType;
+ using SimpleType = Value *;
+
static SimpleType getSimplifiedValue(Use &Val) { return Val.get(); }
};
template <> struct simplify_type<const Use> {
- typedef /*const*/ Value *SimpleType;
+ using SimpleType = /*const*/ Value *;
+
static SimpleType getSimplifiedValue(const Use &Val) { return Val.get(); }
};
diff --git a/include/llvm/IR/UseListOrder.h b/include/llvm/IR/UseListOrder.h
index ebe99223facd..a8b394fc6302 100644
--- a/include/llvm/IR/UseListOrder.h
+++ b/include/llvm/IR/UseListOrder.h
@@ -37,7 +37,7 @@ struct UseListOrder {
UseListOrder &operator=(UseListOrder &&) = default;
};
-typedef std::vector<UseListOrder> UseListOrderStack;
+using UseListOrderStack = std::vector<UseListOrder>;
} // end namespace llvm
diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h
index 54758a9b6d6a..7b9d451aaf53 100644
--- a/include/llvm/IR/User.h
+++ b/include/llvm/IR/User.h
@@ -1,4 +1,4 @@
-//===-- llvm/User.h - User class definition ---------------------*- C++ -*-===//
+//===- llvm/User.h - User class definition ----------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -114,6 +114,7 @@ protected:
? OperandTraits<U>::op_end(const_cast<U*>(that))[Idx]
: OperandTraits<U>::op_begin(const_cast<U*>(that))[Idx];
}
+
template <int Idx> Use &Op() {
return OpFrom<Idx>(this);
}
@@ -205,10 +206,10 @@ public:
// ---------------------------------------------------------------------------
// Operand Iterator interface...
//
- typedef Use* op_iterator;
- typedef const Use* const_op_iterator;
- typedef iterator_range<op_iterator> op_range;
- typedef iterator_range<const_op_iterator> const_op_range;
+ using op_iterator = Use*;
+ using const_op_iterator = const Use*;
+ using op_range = iterator_range<op_iterator>;
+ using const_op_range = iterator_range<const_op_iterator>;
op_iterator op_begin() { return getOperandList(); }
const_op_iterator op_begin() const { return getOperandList(); }
@@ -252,6 +253,7 @@ public:
ptrdiff_t, const Value *, const Value *> {
explicit const_value_op_iterator(const Use *U = nullptr) :
iterator_adaptor_base(U) {}
+
const Value *operator*() const { return *I; }
const Value *operator->() const { return operator*(); }
};
@@ -290,6 +292,7 @@ public:
return isa<Instruction>(V) || isa<Constant>(V);
}
};
+
// Either Use objects, or a Use pointer can be prepended to User.
static_assert(alignof(Use) >= alignof(User),
"Alignment is insufficient after objects prepended to User");
@@ -297,13 +300,15 @@ static_assert(alignof(Use *) >= alignof(User),
"Alignment is insufficient after objects prepended to User");
template<> struct simplify_type<User::op_iterator> {
- typedef Value* SimpleType;
+ using SimpleType = Value*;
+
static SimpleType getSimplifiedValue(User::op_iterator &Val) {
return Val->get();
}
};
template<> struct simplify_type<User::const_op_iterator> {
- typedef /*const*/ Value* SimpleType;
+ using SimpleType = /*const*/ Value*;
+
static SimpleType getSimplifiedValue(User::const_op_iterator &Val) {
return Val->get();
}
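The op_iterator aliases above are what User::operands() hands back. A sketch with an invented helper name:

    unsigned countConstantOperands(const llvm::User &U) {
      unsigned N = 0;
      for (const llvm::Use &Op : U.operands())
        if (llvm::isa<llvm::Constant>(Op.get()))
          ++N;
      return N;
    }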
diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h
index 00f821399257..96a370dcc35f 100644
--- a/include/llvm/IR/Value.h
+++ b/include/llvm/IR/Value.h
@@ -1,4 +1,4 @@
-//===-- llvm/Value.h - Definition of the Value class ------------*- C++ -*-===//
+//===- llvm/Value.h - Definition of the Value class -------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -44,12 +44,12 @@ class LLVMContext;
class Module;
class ModuleSlotTracker;
class raw_ostream;
+template<typename ValueTy> class StringMapEntry;
class StringRef;
class Twine;
class Type;
-template<typename ValueTy> class StringMapEntry;
-typedef StringMapEntry<Value*> ValueName;
+using ValueName = StringMapEntry<Value*>;
//===----------------------------------------------------------------------===//
// Value Class
@@ -120,9 +120,11 @@ private:
template <typename UseT> // UseT == 'Use' or 'const Use'
class use_iterator_impl
: public std::iterator<std::forward_iterator_tag, UseT *> {
+ friend class Value;
+
UseT *U;
+
explicit use_iterator_impl(UseT *u) : U(u) {}
- friend class Value;
public:
use_iterator_impl() : U() {}
@@ -309,8 +311,9 @@ public:
return UseList == nullptr;
}
- typedef use_iterator_impl<Use> use_iterator;
- typedef use_iterator_impl<const Use> const_use_iterator;
+ using use_iterator = use_iterator_impl<Use>;
+ using const_use_iterator = use_iterator_impl<const Use>;
+
use_iterator materialized_use_begin() { return use_iterator(UseList); }
const_use_iterator materialized_use_begin() const {
return const_use_iterator(UseList);
@@ -345,8 +348,9 @@ public:
return UseList == nullptr;
}
- typedef user_iterator_impl<User> user_iterator;
- typedef user_iterator_impl<const User> const_user_iterator;
+ using user_iterator = user_iterator_impl<User>;
+ using const_user_iterator = user_iterator_impl<const User>;
+
user_iterator materialized_user_begin() { return user_iterator(UseList); }
const_user_iterator materialized_user_begin() const {
return const_user_iterator(UseList);
@@ -560,7 +564,6 @@ public:
/// block.
const Value *DoPHITranslation(const BasicBlock *CurBB,
const BasicBlock *PredBB) const;
-
Value *DoPHITranslation(const BasicBlock *CurBB, const BasicBlock *PredBB) {
return const_cast<Value *>(
static_cast<const Value *>(this)->DoPHITranslation(CurBB, PredBB));
@@ -606,7 +609,7 @@ private:
Use *Merged;
Use **Next = &Merged;
- for (;;) {
+ while (true) {
if (!L) {
*Next = R;
break;
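use_iterator and user_iterator above back Value::uses() and users(). A sketch with an invented helper name:

    void dumpInstructionUsers(const llvm::Value *V) {
      for (const llvm::User *U : V->users())
        if (auto *I = llvm::dyn_cast<llvm::Instruction>(U))
          I->dump(); // every instruction that reads V
    }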
diff --git a/include/llvm/IR/ValueHandle.h b/include/llvm/IR/ValueHandle.h
index 393618d5511b..b45cc7b6dc02 100644
--- a/include/llvm/IR/ValueHandle.h
+++ b/include/llvm/IR/ValueHandle.h
@@ -17,10 +17,10 @@
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <cassert>
namespace llvm {
-class ValueHandleBase;
-template<typename From> struct simplify_type;
/// \brief This is the common base class of value handles.
///
@@ -29,6 +29,7 @@ template<typename From> struct simplify_type;
/// below for details.
class ValueHandleBase {
friend class Value;
+
protected:
/// \brief This indicates what sub class the handle actually is.
///
@@ -40,24 +41,23 @@ protected:
: ValueHandleBase(RHS.PrevPair.getInt(), RHS) {}
ValueHandleBase(HandleBaseKind Kind, const ValueHandleBase &RHS)
- : PrevPair(nullptr, Kind), Next(nullptr), Val(RHS.getValPtr()) {
+ : PrevPair(nullptr, Kind), Val(RHS.getValPtr()) {
if (isValid(getValPtr()))
AddToExistingUseList(RHS.getPrevPtr());
}
private:
PointerIntPair<ValueHandleBase**, 2, HandleBaseKind> PrevPair;
- ValueHandleBase *Next;
-
- Value *Val;
+ ValueHandleBase *Next = nullptr;
+ Value *Val = nullptr;
void setValPtr(Value *V) { Val = V; }
public:
explicit ValueHandleBase(HandleBaseKind Kind)
- : PrevPair(nullptr, Kind), Next(nullptr), Val(nullptr) {}
+ : PrevPair(nullptr, Kind) {}
ValueHandleBase(HandleBaseKind Kind, Value *V)
- : PrevPair(nullptr, Kind), Next(nullptr), Val(V) {
+ : PrevPair(nullptr, Kind), Val(V) {
if (isValid(getValPtr()))
AddToUseList();
}
@@ -162,11 +162,13 @@ public:
// Specialize simplify_type to allow WeakVH to participate in
// dyn_cast, isa, etc.
template <> struct simplify_type<WeakVH> {
- typedef Value *SimpleType;
+ using SimpleType = Value *;
+
static SimpleType getSimplifiedValue(WeakVH &WVH) { return WVH; }
};
template <> struct simplify_type<const WeakVH> {
- typedef Value *SimpleType;
+ using SimpleType = Value *;
+
static SimpleType getSimplifiedValue(const WeakVH &WVH) { return WVH; }
};
@@ -205,11 +207,13 @@ public:
// Specialize simplify_type to allow WeakTrackingVH to participate in
// dyn_cast, isa, etc.
template <> struct simplify_type<WeakTrackingVH> {
- typedef Value *SimpleType;
+ using SimpleType = Value *;
+
static SimpleType getSimplifiedValue(WeakTrackingVH &WVH) { return WVH; }
};
template <> struct simplify_type<const WeakTrackingVH> {
- typedef Value *SimpleType;
+ using SimpleType = Value *;
+
static SimpleType getSimplifiedValue(const WeakTrackingVH &WVH) {
return WVH;
}
@@ -236,7 +240,7 @@ class AssertingVH
: public ValueHandleBase
#endif
{
- friend struct DenseMapInfo<AssertingVH<ValueTy> >;
+ friend struct DenseMapInfo<AssertingVH<ValueTy>>;
#ifndef NDEBUG
Value *getRawValPtr() const { return ValueHandleBase::getValPtr(); }
@@ -282,20 +286,23 @@ public:
// Specialize DenseMapInfo to allow AssertingVH to participate in DenseMap.
template<typename T>
-struct DenseMapInfo<AssertingVH<T> > {
+struct DenseMapInfo<AssertingVH<T>> {
static inline AssertingVH<T> getEmptyKey() {
AssertingVH<T> Res;
Res.setRawValPtr(DenseMapInfo<Value *>::getEmptyKey());
return Res;
}
+
static inline AssertingVH<T> getTombstoneKey() {
AssertingVH<T> Res;
Res.setRawValPtr(DenseMapInfo<Value *>::getTombstoneKey());
return Res;
}
+
static unsigned getHashValue(const AssertingVH<T> &Val) {
return DenseMapInfo<Value *>::getHashValue(Val.getRawValPtr());
}
+
static bool isEqual(const AssertingVH<T> &LHS, const AssertingVH<T> &RHS) {
return DenseMapInfo<Value *>::isEqual(LHS.getRawValPtr(),
RHS.getRawValPtr());
@@ -303,7 +310,7 @@ struct DenseMapInfo<AssertingVH<T> > {
};
template <typename T>
-struct isPodLike<AssertingVH<T> > {
+struct isPodLike<AssertingVH<T>> {
#ifdef NDEBUG
static const bool value = true;
#else
@@ -356,7 +363,7 @@ public:
static Value *GetAsValue(const Value *V) { return const_cast<Value*>(V); }
public:
- TrackingVH() {}
+ TrackingVH() = default;
TrackingVH(ValueTy *P) { setValPtr(P); }
operator ValueTy*() const {
@@ -495,10 +502,12 @@ public:
PoisoningVH(ValueTy *P) : CallbackVH(GetAsValue(P)) {}
PoisoningVH(const PoisoningVH &RHS)
: CallbackVH(RHS), Poisoned(RHS.Poisoned) {}
+
~PoisoningVH() {
if (Poisoned)
clearValPtr();
}
+
PoisoningVH &operator=(const PoisoningVH &RHS) {
if (Poisoned)
clearValPtr();
@@ -523,14 +532,17 @@ template <typename T> struct DenseMapInfo<PoisoningVH<T>> {
Res.setRawValPtr(DenseMapInfo<Value *>::getEmptyKey());
return Res;
}
+
static inline PoisoningVH<T> getTombstoneKey() {
PoisoningVH<T> Res;
Res.setRawValPtr(DenseMapInfo<Value *>::getTombstoneKey());
return Res;
}
+
static unsigned getHashValue(const PoisoningVH<T> &Val) {
return DenseMapInfo<Value *>::getHashValue(Val.getRawValPtr());
}
+
static bool isEqual(const PoisoningVH<T> &LHS, const PoisoningVH<T> &RHS) {
return DenseMapInfo<Value *>::isEqual(LHS.getRawValPtr(),
RHS.getRawValPtr());
@@ -545,6 +557,6 @@ template <typename T> struct isPodLike<PoisoningVH<T>> {
#endif
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_IR_VALUEHANDLE_H
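// Editor's note: a minimal usage sketch, not part of the patch, showing what
// the simplify_type specializations above enable: WeakVH unwraps to Value*,
// so handles participate directly in isa<>/dyn_cast<>. The function name
// `inspect` is a placeholder for illustration.

#include "llvm/IR/Instructions.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"

void inspect(llvm::Value *V) {
  llvm::WeakVH Handle(V);
  // simplify_type<WeakVH> lets dyn_cast look straight through the handle;
  // guard against a null (or RAUW-cleared) handle first.
  if (Handle)
    if (auto *I = llvm::dyn_cast<llvm::Instruction>(Handle))
      I->print(llvm::errs());
}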
diff --git a/include/llvm/IR/ValueMap.h b/include/llvm/IR/ValueMap.h
index 9648e1989f94..11d5823ee479 100644
--- a/include/llvm/IR/ValueMap.h
+++ b/include/llvm/IR/ValueMap.h
@@ -46,7 +46,6 @@ namespace llvm {
template<typename KeyT, typename ValueT, typename Config>
class ValueMapCallbackVH;
-
template<typename DenseMapT, typename KeyT>
class ValueMapIterator;
template<typename DenseMapT, typename KeyT>
@@ -57,7 +56,7 @@ class ValueMapConstIterator;
/// as possible with future versions of ValueMap.
template<typename KeyT, typename MutexT = sys::Mutex>
struct ValueMapConfig {
- typedef MutexT mutex_type;
+ using mutex_type = MutexT;
/// If FollowRAUW is true, the ValueMap will update mappings on RAUW. If it's
/// false, the ValueMap will leave the original mapping in place.
@@ -87,21 +86,21 @@ template<typename KeyT, typename ValueT, typename Config =ValueMapConfig<KeyT>>
class ValueMap {
friend class ValueMapCallbackVH<KeyT, ValueT, Config>;
- typedef ValueMapCallbackVH<KeyT, ValueT, Config> ValueMapCVH;
- typedef DenseMap<ValueMapCVH, ValueT, DenseMapInfo<ValueMapCVH>> MapT;
- typedef DenseMap<const Metadata *, TrackingMDRef> MDMapT;
- typedef typename Config::ExtraData ExtraData;
+ using ValueMapCVH = ValueMapCallbackVH<KeyT, ValueT, Config>;
+ using MapT = DenseMap<ValueMapCVH, ValueT, DenseMapInfo<ValueMapCVH>>;
+ using MDMapT = DenseMap<const Metadata *, TrackingMDRef>;
+ using ExtraData = typename Config::ExtraData;
+
MapT Map;
Optional<MDMapT> MDMap;
ExtraData Data;
-
bool MayMapMetadata = true;
public:
- typedef KeyT key_type;
- typedef ValueT mapped_type;
- typedef std::pair<KeyT, ValueT> value_type;
- typedef unsigned size_type;
+ using key_type = KeyT;
+ using mapped_type = ValueT;
+ using value_type = std::pair<KeyT, ValueT>;
+ using size_type = unsigned;
explicit ValueMap(unsigned NumInitBuckets = 64)
: Map(NumInitBuckets), Data() {}
@@ -132,8 +131,9 @@ public:
return Where->second.get();
}
- typedef ValueMapIterator<MapT, KeyT> iterator;
- typedef ValueMapConstIterator<MapT, KeyT> const_iterator;
+ using iterator = ValueMapIterator<MapT, KeyT>;
+ using const_iterator = ValueMapConstIterator<MapT, KeyT>;
+
inline iterator begin() { return iterator(Map.begin()); }
inline iterator end() { return iterator(Map.end()); }
inline const_iterator begin() const { return const_iterator(Map.begin()); }
@@ -244,8 +244,8 @@ class ValueMapCallbackVH final : public CallbackVH {
friend class ValueMap<KeyT, ValueT, Config>;
friend struct DenseMapInfo<ValueMapCallbackVH>;
- typedef ValueMap<KeyT, ValueT, Config> ValueMapT;
- typedef typename std::remove_pointer<KeyT>::type KeySansPointerT;
+ using ValueMapT = ValueMap<KeyT, ValueT, Config>;
+ using KeySansPointerT = typename std::remove_pointer<KeyT>::type;
ValueMapT *Map;
@@ -298,7 +298,7 @@ public:
template<typename KeyT, typename ValueT, typename Config>
struct DenseMapInfo<ValueMapCallbackVH<KeyT, ValueT, Config>> {
- typedef ValueMapCallbackVH<KeyT, ValueT, Config> VH;
+ using VH = ValueMapCallbackVH<KeyT, ValueT, Config>;
static inline VH getEmptyKey() {
return VH(DenseMapInfo<Value *>::getEmptyKey());
@@ -330,8 +330,8 @@ class ValueMapIterator :
public std::iterator<std::forward_iterator_tag,
std::pair<KeyT, typename DenseMapT::mapped_type>,
ptrdiff_t> {
- typedef typename DenseMapT::iterator BaseT;
- typedef typename DenseMapT::mapped_type ValueT;
+ using BaseT = typename DenseMapT::iterator;
+ using ValueT = typename DenseMapT::mapped_type;
BaseT I;
@@ -344,7 +344,9 @@ public:
struct ValueTypeProxy {
const KeyT first;
ValueT& second;
+
ValueTypeProxy *operator->() { return this; }
+
operator std::pair<KeyT, ValueT>() const {
return std::make_pair(first, second);
}
@@ -380,8 +382,8 @@ class ValueMapConstIterator :
public std::iterator<std::forward_iterator_tag,
std::pair<KeyT, typename DenseMapT::mapped_type>,
ptrdiff_t> {
- typedef typename DenseMapT::const_iterator BaseT;
- typedef typename DenseMapT::mapped_type ValueT;
+ using BaseT = typename DenseMapT::const_iterator;
+ using ValueT = typename DenseMapT::mapped_type;
BaseT I;
diff --git a/include/llvm/IR/ValueSymbolTable.h b/include/llvm/IR/ValueSymbolTable.h
index 9e86751dae6f..26cbbfabfc0c 100644
--- a/include/llvm/IR/ValueSymbolTable.h
+++ b/include/llvm/IR/ValueSymbolTable.h
@@ -49,13 +49,13 @@ class ValueSymbolTable {
/// @{
public:
/// @brief A mapping of names to values.
- typedef StringMap<Value*> ValueMap;
+ using ValueMap = StringMap<Value*>;
/// @brief An iterator over a ValueMap.
- typedef ValueMap::iterator iterator;
+ using iterator = ValueMap::iterator;
/// @brief A const_iterator over a ValueMap.
- typedef ValueMap::const_iterator const_iterator;
+ using const_iterator = ValueMap::const_iterator;
/// @}
/// @name Constructors
diff --git a/include/llvm/IR/Verifier.h b/include/llvm/IR/Verifier.h
index 71f727c3d4fc..15e52d9e0742 100644
--- a/include/llvm/IR/Verifier.h
+++ b/include/llvm/IR/Verifier.h
@@ -21,13 +21,17 @@
#ifndef LLVM_IR_VERIFIER_H
#define LLVM_IR_VERIFIER_H
+#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/PassManager.h"
+#include <utility>
namespace llvm {
+class APInt;
class Function;
class FunctionPass;
-class ModulePass;
+class Instruction;
+class MDNode;
class Module;
class raw_ostream;
struct VerifierSupport;
@@ -47,7 +51,7 @@ class TBAAVerifier {
/// the offset of the access. If zero, only a zero offset is allowed.
///
/// \c BitWidth has no meaning if \c IsInvalid is true.
- typedef std::pair<bool, unsigned> TBAABaseNodeSummary;
+ using TBAABaseNodeSummary = std::pair<bool, unsigned>;
DenseMap<const MDNode *, TBAABaseNodeSummary> TBAABaseNodes;
/// Maps an alleged scalar TBAA node to a boolean that is true if the said
@@ -101,12 +105,14 @@ FunctionPass *createVerifierPass(bool FatalErrors = true);
/// and debug info errors.
class VerifierAnalysis : public AnalysisInfoMixin<VerifierAnalysis> {
friend AnalysisInfoMixin<VerifierAnalysis>;
+
static AnalysisKey Key;
public:
struct Result {
bool IRBroken, DebugInfoBroken;
};
+
Result run(Module &M, ModuleAnalysisManager &);
Result run(Function &F, FunctionAnalysisManager &);
};
@@ -136,7 +142,6 @@ public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
+} // end namespace llvm
-} // End llvm namespace
-
-#endif
+#endif // LLVM_IR_VERIFIER_H
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 44ff4c1a581b..cf314e19d1ca 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -130,6 +130,7 @@ void initializeEfficiencySanitizerPass(PassRegistry&);
void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&);
void initializeExpandISelPseudosPass(PassRegistry&);
void initializeExpandPostRAPass(PassRegistry&);
+void initializeExpandReductionsPass(PassRegistry&);
void initializeExternalAAWrapperPassPass(PassRegistry&);
void initializeFEntryInserterPass(PassRegistry&);
void initializeFinalizeMachineBundlesPass(PassRegistry&);
@@ -186,6 +187,7 @@ void initializeLintPass(PassRegistry&);
void initializeLiveDebugValuesPass(PassRegistry&);
void initializeLiveDebugVariablesPass(PassRegistry&);
void initializeLiveIntervalsPass(PassRegistry&);
+void initializeLiveRangeShrinkPass(PassRegistry&);
void initializeLiveRegMatrixPass(PassRegistry&);
void initializeLiveStacksPass(PassRegistry&);
void initializeLiveVariablesPass(PassRegistry&);
@@ -319,11 +321,12 @@ void initializeSCCPLegacyPassPass(PassRegistry&);
void initializeSCEVAAWrapperPassPass(PassRegistry&);
void initializeSLPVectorizerPass(PassRegistry&);
void initializeSROALegacyPassPass(PassRegistry&);
-void initializeSafeStackPass(PassRegistry&);
+void initializeSafeStackLegacyPassPass(PassRegistry&);
void initializeSampleProfileLoaderLegacyPassPass(PassRegistry&);
void initializeSanitizerCoverageModulePass(PassRegistry&);
void initializeScalarEvolutionWrapperPassPass(PassRegistry&);
void initializeScalarizerPass(PassRegistry&);
+void initializeScalarizeMaskedMemIntrinPass(PassRegistry&);
void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&);
void initializeSeparateConstOffsetFromGEPPass(PassRegistry&);
void initializeShadowStackGCLoweringPass(PassRegistry&);
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 39a86e838bde..5c398b2ab567 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -206,6 +206,7 @@ namespace {
(void) llvm::createMemDerefPrinter();
(void) llvm::createFloat2IntPass();
(void) llvm::createEliminateAvailableExternallyPass();
+ (void) llvm::createScalarizeMaskedMemIntrinPass();
(void)new llvm::IntervalPartition();
(void)new llvm::ScalarEvolutionWrapperPass();
diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h
index 4bc39d98b7af..d200d4a148e3 100644
--- a/include/llvm/Object/Wasm.h
+++ b/include/llvm/Object/Wasm.h
@@ -67,7 +67,8 @@ public:
WasmObjectFile(MemoryBufferRef Object, Error &Err);
const wasm::WasmObjectHeader &getHeader() const;
- const WasmSymbol &getWasmSymbol(DataRefImpl Symb) const;
+ const WasmSymbol &getWasmSymbol(const DataRefImpl &Symb) const;
+ const WasmSymbol &getWasmSymbol(const SymbolRef &Symbol) const;
const WasmSection &getWasmSection(const SectionRef &Section) const;
const wasm::WasmRelocation &getWasmRelocation(const RelocationRef& Ref) const;
@@ -81,6 +82,10 @@ public:
const std::vector<wasm::WasmGlobal>& globals() const { return Globals; }
const std::vector<wasm::WasmExport>& exports() const { return Exports; }
+ uint32_t getNumberOfSymbols() const {
+ return Symbols.size();
+ }
+
const std::vector<wasm::WasmElemSegment>& elements() const {
return ElemSegments;
}
diff --git a/include/llvm/ObjectYAML/WasmYAML.h b/include/llvm/ObjectYAML/WasmYAML.h
index bd7d72be4dbc..7b70c9537827 100644
--- a/include/llvm/ObjectYAML/WasmYAML.h
+++ b/include/llvm/ObjectYAML/WasmYAML.h
@@ -34,17 +34,6 @@ struct FileHeader {
yaml::Hex32 Version;
};
-struct Import {
- StringRef Module;
- StringRef Field;
- ExportKind Kind;
- union {
- uint32_t SigIndex;
- ValueType GlobalType;
- };
- bool GlobalMutable;
-};
-
struct Limits {
yaml::Hex32 Flags;
yaml::Hex32 Initial;
@@ -74,6 +63,18 @@ struct Global {
wasm::WasmInitExpr InitExpr;
};
+struct Import {
+ StringRef Module;
+ StringRef Field;
+ ExportKind Kind;
+ union {
+ uint32_t SigIndex;
+ Global GlobalImport;
+ Table TableImport;
+ Limits Memory;
+ };
+};
+
struct LocalDecl {
ValueType Type;
uint32_t Count;
diff --git a/include/llvm/ProfileData/SampleProfWriter.h b/include/llvm/ProfileData/SampleProfWriter.h
index 9d69af32dd46..86af1038d74e 100644
--- a/include/llvm/ProfileData/SampleProfWriter.h
+++ b/include/llvm/ProfileData/SampleProfWriter.h
@@ -43,16 +43,7 @@ public:
/// Write all the sample profiles in the given map of samples.
///
/// \returns status code of the file update operation.
- std::error_code write(const StringMap<FunctionSamples> &ProfileMap) {
- if (std::error_code EC = writeHeader(ProfileMap))
- return EC;
- for (const auto &I : ProfileMap) {
- const FunctionSamples &Profile = I.second;
- if (std::error_code EC = write(Profile))
- return EC;
- }
- return sampleprof_error::success;
- }
+ std::error_code write(const StringMap<FunctionSamples> &ProfileMap);
raw_ostream &getOutputStream() { return *OutputStream; }
diff --git a/include/llvm/Support/BinaryStreamArray.h b/include/llvm/Support/BinaryStreamArray.h
index bad31cd38d6a..77c99ffff919 100644
--- a/include/llvm/Support/BinaryStreamArray.h
+++ b/include/llvm/Support/BinaryStreamArray.h
@@ -139,6 +139,7 @@ public:
}
uint32_t offset() const { return AbsOffset; }
+ uint32_t getRecordLength() const { return ThisLen; }
private:
void moveToEnd() {
@@ -294,6 +295,8 @@ template <typename T> class FixedStreamArray {
friend class FixedStreamArrayIterator<T>;
public:
+ typedef FixedStreamArrayIterator<T> Iterator;
+
FixedStreamArray() = default;
explicit FixedStreamArray(BinaryStreamRef Stream) : Stream(Stream) {
assert(Stream.getLength() % sizeof(T) == 0);
@@ -371,7 +374,7 @@ public:
}
FixedStreamArrayIterator<T> &operator-=(std::ptrdiff_t N) {
- assert(Index >= N);
+ assert(std::ptrdiff_t(Index) >= N);
Index -= N;
return *this;
}
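// Editor's note: an editorial sketch of the signed/unsigned pitfall behind
// the assert change above; `Index` mirrors the iterator's uint32_t member
// and this helper is hypothetical. Comparing an unsigned Index against a
// signed N converts N to unsigned, so a negative N (stepping forward via
// operator-=) made the old assert misfire; casting Index to ptrdiff_t
// compares in the signed domain instead.

#include <cassert>
#include <cstddef>
#include <cstdint>

void checkedRetreat(uint32_t &Index, std::ptrdiff_t N) {
  // Old form: assert(Index >= N);  // N == -1 becomes 0xFFFF... and fails.
  assert(std::ptrdiff_t(Index) >= N && "cannot move before the first element");
  Index -= N;
}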
diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h
index a56bc93e111b..be9e46540016 100644
--- a/include/llvm/Support/Compiler.h
+++ b/include/llvm/Support/Compiler.h
@@ -111,12 +111,6 @@
#define LLVM_PREFETCH(addr, rw, locality)
#endif
-#if __has_attribute(sentinel) || LLVM_GNUC_PREREQ(3, 0, 0)
-#define LLVM_END_WITH_NULL __attribute__((sentinel))
-#else
-#define LLVM_END_WITH_NULL
-#endif
-
#if __has_attribute(used) || LLVM_GNUC_PREREQ(3, 1, 0)
#define LLVM_ATTRIBUTE_USED __attribute__((__used__))
#else
@@ -233,6 +227,8 @@
/// LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough)
#define LLVM_FALLTHROUGH [[fallthrough]]
+#elif __has_cpp_attribute(gnu::fallthrough)
+#define LLVM_FALLTHROUGH [[gnu::fallthrough]]
#elif !__cplusplus
// Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious
// error when __has_cpp_attribute is given a scoped attribute in C mode.
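// Editor's note: a small usage sketch (editorial, not from the patch). The
// newly added [[gnu::fallthrough]] branch lets GCC-flavoured compilers that
// lack C++17 [[fallthrough]] still silence -Wimplicit-fallthrough.

#include "llvm/Support/Compiler.h"

int decode(int Kind, int Payload) {
  switch (Kind) {
  case 0:
    Payload *= 2; // preprocess, then deliberately fall through
    LLVM_FALLTHROUGH;
  case 1:
    return Payload;
  default:
    return 0;
  }
}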
diff --git a/include/llvm/Support/KnownBits.h b/include/llvm/Support/KnownBits.h
index 3d38cf878538..2c77d40559b9 100644
--- a/include/llvm/Support/KnownBits.h
+++ b/include/llvm/Support/KnownBits.h
@@ -133,6 +133,66 @@ public:
KnownBits zextOrTrunc(unsigned BitWidth) {
return KnownBits(Zero.zextOrTrunc(BitWidth), One.zextOrTrunc(BitWidth));
}
+
+ /// Returns the minimum number of trailing zero bits.
+ unsigned countMinTrailingZeros() const {
+ return Zero.countTrailingOnes();
+ }
+
+ /// Returns the minimum number of trailing one bits.
+ unsigned countMinTrailingOnes() const {
+ return One.countTrailingOnes();
+ }
+
+ /// Returns the minimum number of leading zero bits.
+ unsigned countMinLeadingZeros() const {
+ return Zero.countLeadingOnes();
+ }
+
+ /// Returns the minimum number of leading one bits.
+ unsigned countMinLeadingOnes() const {
+ return One.countLeadingOnes();
+ }
+
+ /// Returns the number of times the sign bit is replicated into the other
+ /// bits.
+ unsigned countMinSignBits() const {
+ if (isNonNegative())
+ return countMinLeadingZeros();
+ if (isNegative())
+ return countMinLeadingOnes();
+ return 0;
+ }
+
+ /// Returns the maximum number of trailing zero bits possible.
+ unsigned countMaxTrailingZeros() const {
+ return One.countTrailingZeros();
+ }
+
+ /// Returns the maximum number of trailing one bits possible.
+ unsigned countMaxTrailingOnes() const {
+ return Zero.countTrailingZeros();
+ }
+
+ /// Returns the maximum number of leading zero bits possible.
+ unsigned countMaxLeadingZeros() const {
+ return One.countLeadingZeros();
+ }
+
+ /// Returns the maximum number of leading one bits possible.
+ unsigned countMaxLeadingOnes() const {
+ return Zero.countLeadingZeros();
+ }
+
+ /// Returns the number of bits known to be one.
+ unsigned countMinPopulation() const {
+ return One.countPopulation();
+ }
+
+ /// Returns the maximum number of bits that could be one.
+ unsigned countMaxPopulation() const {
+ return getBitWidth() - Zero.countPopulation();
+ }
};
} // end namespace llvm
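// Editor's note: a quick editorial illustration (not part of the patch) of
// how the new accessors derive bounds from the Zero/One masks.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

void knownBitsDemo() {
  // An 8-bit value of the form 0b????1000: bit 3 known one, bits 0-2 known
  // zero, bits 4-7 unknown.
  llvm::KnownBits Known(8);
  Known.One = llvm::APInt(8, 0x08);
  Known.Zero = llvm::APInt(8, 0x07);
  assert(Known.countMinTrailingZeros() == 3); // trailing ones of Zero
  assert(Known.countMaxTrailingZeros() == 3); // trailing zeros of One
  assert(Known.countMinPopulation() == 1);    // bits known to be one
  assert(Known.countMaxPopulation() == 5);    // 8 - bits known to be zero
}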
diff --git a/include/llvm/Support/Parallel.h b/include/llvm/Support/Parallel.h
new file mode 100644
index 000000000000..e36e0cc29e14
--- /dev/null
+++ b/include/llvm/Support/Parallel.h
@@ -0,0 +1,249 @@
+//===- llvm/Support/Parallel.h - Parallel algorithms ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_PARALLEL_H
+#define LLVM_SUPPORT_PARALLEL_H
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/MathExtras.h"
+
+#include <algorithm>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+
+#if defined(_MSC_VER) && LLVM_ENABLE_THREADS
+#pragma warning(push)
+#pragma warning(disable : 4530)
+#include <concrt.h>
+#include <ppl.h>
+#pragma warning(pop)
+#endif
+
+namespace llvm {
+
+namespace parallel {
+struct sequential_execution_policy {};
+struct parallel_execution_policy {};
+
+template <typename T>
+struct is_execution_policy
+ : public std::integral_constant<
+ bool, llvm::is_one_of<T, sequential_execution_policy,
+ parallel_execution_policy>::value> {};
+
+constexpr sequential_execution_policy seq{};
+constexpr parallel_execution_policy par{};
+
+namespace detail {
+
+#if LLVM_ENABLE_THREADS
+
+class Latch {
+ uint32_t Count;
+ mutable std::mutex Mutex;
+ mutable std::condition_variable Cond;
+
+public:
+ explicit Latch(uint32_t Count = 0) : Count(Count) {}
+ ~Latch() { sync(); }
+
+ void inc() {
+ std::unique_lock<std::mutex> lock(Mutex);
+ ++Count;
+ }
+
+ void dec() {
+ std::unique_lock<std::mutex> lock(Mutex);
+ if (--Count == 0)
+ Cond.notify_all();
+ }
+
+ void sync() const {
+ std::unique_lock<std::mutex> lock(Mutex);
+ Cond.wait(lock, [&] { return Count == 0; });
+ }
+};
+
+class TaskGroup {
+ Latch L;
+
+public:
+ void spawn(std::function<void()> f);
+
+ void sync() const { L.sync(); }
+};
+
+#if defined(_MSC_VER)
+template <class RandomAccessIterator, class Comparator>
+void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End,
+ const Comparator &Comp) {
+ concurrency::parallel_sort(Start, End, Comp);
+}
+template <class IterTy, class FuncTy>
+void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) {
+ concurrency::parallel_for_each(Begin, End, Fn);
+}
+
+template <class IndexTy, class FuncTy>
+void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) {
+ concurrency::parallel_for(Begin, End, Fn);
+}
+
+#else
+const ptrdiff_t MinParallelSize = 1024;
+
+/// \brief Inclusive median.
+template <class RandomAccessIterator, class Comparator>
+RandomAccessIterator medianOf3(RandomAccessIterator Start,
+ RandomAccessIterator End,
+ const Comparator &Comp) {
+ RandomAccessIterator Mid = Start + (std::distance(Start, End) / 2);
+ return Comp(*Start, *(End - 1))
+ ? (Comp(*Mid, *(End - 1)) ? (Comp(*Start, *Mid) ? Mid : Start)
+ : End - 1)
+ : (Comp(*Mid, *Start) ? (Comp(*(End - 1), *Mid) ? Mid : End - 1)
+ : Start);
+}
+
+template <class RandomAccessIterator, class Comparator>
+void parallel_quick_sort(RandomAccessIterator Start, RandomAccessIterator End,
+ const Comparator &Comp, TaskGroup &TG, size_t Depth) {
+ // Do a sequential sort for small inputs.
+ if (std::distance(Start, End) < detail::MinParallelSize || Depth == 0) {
+ std::sort(Start, End, Comp);
+ return;
+ }
+
+ // Partition.
+ auto Pivot = medianOf3(Start, End, Comp);
+ // Move Pivot to End.
+ std::swap(*(End - 1), *Pivot);
+ Pivot = std::partition(Start, End - 1, [&Comp, End](decltype(*Start) V) {
+ return Comp(V, *(End - 1));
+ });
+ // Move Pivot to middle of partition.
+ std::swap(*Pivot, *(End - 1));
+
+ // Recurse.
+ TG.spawn([=, &Comp, &TG] {
+ parallel_quick_sort(Start, Pivot, Comp, TG, Depth - 1);
+ });
+ parallel_quick_sort(Pivot + 1, End, Comp, TG, Depth - 1);
+}
+
+template <class RandomAccessIterator, class Comparator>
+void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End,
+ const Comparator &Comp) {
+ TaskGroup TG;
+ parallel_quick_sort(Start, End, Comp, TG,
+ llvm::Log2_64(std::distance(Start, End)) + 1);
+}
+
+template <class IterTy, class FuncTy>
+void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) {
+ // TaskGroup has a relatively high overhead, so we want to reduce
+ // the number of spawn() calls. We'll create up to 1024 tasks here.
+ // (Note that 1024 is an arbitrary number. This code probably needs
+ // improving to take the number of available cores into account.)
+ ptrdiff_t TaskSize = std::distance(Begin, End) / 1024;
+ if (TaskSize == 0)
+ TaskSize = 1;
+
+ TaskGroup TG;
+ while (TaskSize <= std::distance(Begin, End)) {
+ TG.spawn([=, &Fn] { std::for_each(Begin, Begin + TaskSize, Fn); });
+ Begin += TaskSize;
+ }
+ TG.spawn([=, &Fn] { std::for_each(Begin, End, Fn); });
+}
+
+template <class IndexTy, class FuncTy>
+void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) {
+ ptrdiff_t TaskSize = (End - Begin) / 1024;
+ if (TaskSize == 0)
+ TaskSize = 1;
+
+ TaskGroup TG;
+ IndexTy I = Begin;
+ for (; I + TaskSize < End; I += TaskSize) {
+ TG.spawn([=, &Fn] {
+ for (IndexTy J = I, E = I + TaskSize; J != E; ++J)
+ Fn(J);
+ });
+ }
+ TG.spawn([=, &Fn] {
+ for (IndexTy J = I; J < End; ++J)
+ Fn(J);
+ });
+}
+
+#endif
+
+#endif
+
+template <typename Iter>
+using DefComparator =
+ std::less<typename std::iterator_traits<Iter>::value_type>;
+
+} // namespace detail
+
+// sequential algorithm implementations.
+template <class Policy, class RandomAccessIterator,
+ class Comparator = detail::DefComparator<RandomAccessIterator>>
+void sort(Policy policy, RandomAccessIterator Start, RandomAccessIterator End,
+ const Comparator &Comp = Comparator()) {
+ static_assert(is_execution_policy<Policy>::value,
+ "Invalid execution policy!");
+ std::sort(Start, End, Comp);
+}
+
+template <class Policy, class IterTy, class FuncTy>
+void for_each(Policy policy, IterTy Begin, IterTy End, FuncTy Fn) {
+ static_assert(is_execution_policy<Policy>::value,
+ "Invalid execution policy!");
+ std::for_each(Begin, End, Fn);
+}
+
+template <class Policy, class IndexTy, class FuncTy>
+void for_each_n(Policy policy, IndexTy Begin, IndexTy End, FuncTy Fn) {
+ static_assert(is_execution_policy<Policy>::value,
+ "Invalid execution policy!");
+ for (IndexTy I = Begin; I != End; ++I)
+ Fn(I);
+}
+
+// Parallel algorithm implementations, only available when LLVM_ENABLE_THREADS
+// is true.
+#if LLVM_ENABLE_THREADS
+template <class RandomAccessIterator,
+ class Comparator = detail::DefComparator<RandomAccessIterator>>
+void sort(parallel_execution_policy policy, RandomAccessIterator Start,
+ RandomAccessIterator End, const Comparator &Comp = Comparator()) {
+ detail::parallel_sort(Start, End, Comp);
+}
+
+template <class IterTy, class FuncTy>
+void for_each(parallel_execution_policy policy, IterTy Begin, IterTy End,
+ FuncTy Fn) {
+ detail::parallel_for_each(Begin, End, Fn);
+}
+
+template <class IndexTy, class FuncTy>
+void for_each_n(parallel_execution_policy policy, IndexTy Begin, IndexTy End,
+ FuncTy Fn) {
+ detail::parallel_for_each_n(Begin, End, Fn);
+}
+#endif
+
+} // namespace parallel
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_PARALLEL_H
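// Editor's note: a hedged usage sketch for the new header (editorial, not
// part of the patch). With LLVM_ENABLE_THREADS, parallel::par dispatches to
// the parallel overloads; parallel::seq always takes the sequential path.

#include "llvm/Support/Parallel.h"
#include <cstddef>
#include <vector>

void demo(std::vector<int> &V) {
  // Sort in parallel (falls back to std::sort in a threads-off build).
  llvm::parallel::sort(llvm::parallel::par, V.begin(), V.end());
  // Apply a function to every element.
  llvm::parallel::for_each(llvm::parallel::par, V.begin(), V.end(),
                           [](int &X) { X += 1; });
  // Index-based loop over [0, V.size()).
  llvm::parallel::for_each_n(llvm::parallel::par, size_t(0), V.size(),
                             [&](size_t I) { V[I] *= 2; });
}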
diff --git a/include/llvm/Support/Wasm.h b/include/llvm/Support/Wasm.h
index a48dfe10b3bb..e3831827062c 100644
--- a/include/llvm/Support/Wasm.h
+++ b/include/llvm/Support/Wasm.h
@@ -37,17 +37,6 @@ struct WasmSignature {
int32_t ReturnType;
};
-struct WasmImport {
- StringRef Module;
- StringRef Field;
- uint32_t Kind;
- union {
- uint32_t SigIndex;
- int32_t GlobalType;
- };
- bool GlobalMutable;
-};
-
struct WasmExport {
StringRef Name;
uint32_t Kind;
@@ -82,6 +71,18 @@ struct WasmGlobal {
WasmInitExpr InitExpr;
};
+struct WasmImport {
+ StringRef Module;
+ StringRef Field;
+ uint32_t Kind;
+ union {
+ uint32_t SigIndex;
+ WasmGlobal Global;
+ WasmTable Table;
+ WasmLimits Memory;
+ };
+};
+
struct WasmLocalDecl {
int32_t Type;
uint32_t Count;
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index fc35b4527bc3..6f44292c47ed 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -680,6 +680,11 @@ class RegisterOperand<RegisterClass regclass, string pm = "printOperand">
// this type. The method normally will just use an alt-name index to look
// up the name to print. Default to the generic printOperand().
string PrintMethod = pm;
+
+ // EncoderMethod - The target method name to call to encode this register
+ // operand.
+ string EncoderMethod = "";
+
// ParserMatchClass - The "match class" that operands of this type fit
// in. Match classes are used to define the order in which instructions are
// matched, to ensure that which instruction gets matched is deterministic.
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 82a682cf1f7e..97a6f0c6e3ae 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -172,11 +172,22 @@ public:
/// inalloca arguments. This function reports only the size of the frame part
/// that is set up between the frame setup and destroy pseudo instructions.
int64_t getFrameSize(const MachineInstr &I) const {
- assert(isFrameInstr(I));
+ assert(isFrameInstr(I) && "Not a frame instruction");
assert(I.getOperand(0).getImm() >= 0);
return I.getOperand(0).getImm();
}
+ /// Returns the total frame size, which is made up of the space set up inside
+ /// the pair of frame start-stop instructions and the space that is set up
+ /// prior to the pair.
+ int64_t getFrameTotalSize(const MachineInstr &I) const {
+ if (isFrameSetup(I)) {
+ assert(I.getOperand(1).getImm() >= 0 && "Frame size must not be negative");
+ return getFrameSize(I) + I.getOperand(1).getImm();
+ }
+ return getFrameSize(I);
+ }
+
unsigned getCatchReturnOpcode() const { return CatchRetOpcode; }
unsigned getReturnOpcode() const { return ReturnOpcode; }
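// Editor's note: an editorial illustration with hypothetical values; TII and
// Setup are placeholders. For a call sequence lowered roughly as
//   ADJCALLSTACKDOWN 32, 16, ...   ; operand 0: frame part, operand 1: extra
//   ...
//   ADJCALLSTACKUP 32, 0, ...
// the two queries differ only on the setup instruction.

#include "llvm/Target/TargetInstrInfo.h"

void frameSizes(const llvm::TargetInstrInfo &TII,
                const llvm::MachineInstr &Setup) {
  int64_t Frame = TII.getFrameSize(Setup);      // 32: space inside the pair
  int64_t Total = TII.getFrameTotalSize(Setup); // 48: plus the 16 bytes set
                                                // up before the pair (e.g.
                                                // by-value stack arguments)
  (void)Frame;
  (void)Total;
}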
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index ced183852146..1ca32d4c3589 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1396,7 +1396,10 @@ public:
/// It is called by AtomicExpandPass before expanding an
/// AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad
/// if shouldInsertFencesForAtomic returns true.
- /// RMW and CmpXchg set both IsStore and IsLoad to true.
+ ///
+ /// Inst is the original atomic instruction, prior to other expansions that
+ /// may be performed.
+ ///
/// This function should either return a nullptr, or a pointer to an IR-level
/// Instruction*. Even complex fence sequences can be represented by a
/// single Instruction* through an intrinsic to be lowered later.
@@ -1422,18 +1425,17 @@ public:
/// seq_cst. But if they are lowered to monotonic accesses, no amount of
/// IR-level fences can prevent it.
/// @{
- virtual Instruction *emitLeadingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
- if (isReleaseOrStronger(Ord) && IsStore)
+ virtual Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const {
+ if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
return Builder.CreateFence(Ord);
else
return nullptr;
}
virtual Instruction *emitTrailingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
if (isAcquireOrStronger(Ord))
return Builder.CreateFence(Ord);
else
@@ -2061,14 +2063,6 @@ public:
return false;
}
- // Return true if the instruction that performs a << b actually performs
- // a << (b % (sizeof(a) * 8)).
- virtual bool supportsModuloShift(ISD::NodeType Inst, EVT ReturnType) const {
- assert((Inst == ISD::SHL || Inst == ISD::SRA || Inst == ISD::SRL) &&
- "Expect a shift instruction");
- return false;
- }
-
//===--------------------------------------------------------------------===//
// Runtime Library hooks
//
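// Editor's note: a hypothetical target override (MyTargetLowering is a
// placeholder, not from the patch), showing the new signature: the original
// atomic instruction is passed in, so the target can test the operation
// itself instead of the removed IsStore/IsLoad flags.

llvm::Instruction *
MyTargetLowering::emitLeadingFence(llvm::IRBuilder<> &Builder,
                                   llvm::Instruction *Inst,
                                   llvm::AtomicOrdering Ord) const {
  // Release-or-stronger orderings need a fence only before operations that
  // store (RMW and cmpxchg count as stores via hasAtomicStore()).
  if (llvm::isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
    return Builder.CreateFence(Ord);
  return nullptr;
}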
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index d342e4fe2613..7b00c9420e35 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -334,7 +334,7 @@ class ReadAdvance<SchedRead read, int cycles, list<SchedWrite> writes = []>
}
// Directly associate a new SchedRead type with a delay and optional
-// pipeline bypess. For use with InstRW or ItinRW.
+// pipeline bypass. For use with InstRW or ItinRW.
class SchedReadAdvance<int cycles, list<SchedWrite> writes = []> : SchedRead,
ProcReadAdvance<cycles, writes>;
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index 45a842f77a21..9ed614ccee17 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -281,7 +281,7 @@ def SDTConvertOp : SDTypeProfile<1, 5, [ //cvtss, su, us, uu, ff, fs, fu, sf, su
]>;
class SDCallSeqStart<list<SDTypeConstraint> constraints> :
- SDTypeProfile<0, 1, constraints>;
+ SDTypeProfile<0, 2, constraints>;
class SDCallSeqEnd<list<SDTypeConstraint> constraints> :
SDTypeProfile<0, 2, constraints>;
diff --git a/include/llvm/LibDriver/LibDriver.h b/include/llvm/ToolDrivers/llvm-lib/LibDriver.h
index 95feb60be403..a4806ac4ad69 100644
--- a/include/llvm/LibDriver/LibDriver.h
+++ b/include/llvm/ToolDrivers/llvm-lib/LibDriver.h
@@ -1,4 +1,4 @@
-//===- llvm/LibDriver/LibDriver.h - lib.exe-compatible driver ---*- C++ -*-===//
+//===- llvm-lib/LibDriver.h - lib.exe-compatible driver ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIBDRIVER_LIBDRIVER_H
-#define LLVM_LIBDRIVER_LIBDRIVER_H
+#ifndef LLVM_TOOLDRIVERS_LLVM_LIB_LIBDRIVER_H
+#define LLVM_TOOLDRIVERS_LLVM_LIB_LIBDRIVER_H
namespace llvm {
template <typename T> class ArrayRef;
diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index 0a8903a6ed7b..91c9d255302f 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h
@@ -43,6 +43,7 @@ class InvokeInst;
class Loop;
class LoopInfo;
class Module;
+class ProfileSummaryInfo;
class ReturnInst;
/// Return an exact copy of the specified module
@@ -175,15 +176,17 @@ public:
explicit InlineFunctionInfo(CallGraph *cg = nullptr,
std::function<AssumptionCache &(Function &)>
*GetAssumptionCache = nullptr,
+ ProfileSummaryInfo *PSI = nullptr,
BlockFrequencyInfo *CallerBFI = nullptr,
BlockFrequencyInfo *CalleeBFI = nullptr)
- : CG(cg), GetAssumptionCache(GetAssumptionCache), CallerBFI(CallerBFI),
- CalleeBFI(CalleeBFI) {}
+ : CG(cg), GetAssumptionCache(GetAssumptionCache), PSI(PSI),
+ CallerBFI(CallerBFI), CalleeBFI(CalleeBFI) {}
/// CG - If non-null, InlineFunction will update the callgraph to reflect the
/// changes it makes.
CallGraph *CG;
std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
+ ProfileSummaryInfo *PSI;
BlockFrequencyInfo *CallerBFI, *CalleeBFI;
/// StaticAllocas - InlineFunction fills this in with all static allocas that
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index a1cf41d6f931..561f94880624 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -21,6 +21,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
@@ -42,6 +43,7 @@ class PredIteratorCache;
class ScalarEvolution;
class SCEV;
class TargetLibraryInfo;
+class TargetTransformInfo;
/// \brief Captures loop safety information.
/// It keep information for loop & its header may throw exception.
@@ -489,6 +491,36 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE = nullptr);
+/// Generates a vector reduction using shufflevectors to reduce the value.
+Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind
+ MinMaxKind = RecurrenceDescriptor::MRK_Invalid,
+ ArrayRef<Value *> RedOps = ArrayRef<Value *>());
+
+/// Create a target reduction of the given vector. The reduction operation
+/// is described by the \p Opcode parameter. min/max reductions require
+/// additional information supplied in \p Flags.
+/// The target is queried to determine if intrinsics or shuffle sequences are
+/// required to implement the reduction.
+Value *
+createSimpleTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI,
+ unsigned Opcode, Value *Src,
+ TargetTransformInfo::ReductionFlags Flags =
+ TargetTransformInfo::ReductionFlags(),
+ ArrayRef<Value *> RedOps = ArrayRef<Value *>());
+
+/// Create a generic target reduction using a recurrence descriptor \p Desc
+/// The target is queried to determine if intrinsics or shuffle sequences are
+/// required to implement the reduction.
+Value *createTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI,
+ RecurrenceDescriptor &Desc, Value *Src,
+ bool NoNaN = false);
+
+/// Get the intersection (logical and) of all of the potential IR flags
+/// of each scalar operation (VL) that will be converted into a vector (I).
+/// Flag set: NSW, NUW, exact, and all of fast-math.
+void propagateIRFlags(Value *I, ArrayRef<Value *> VL);
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H
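// Editor's note: a hedged call-site sketch (editorial; Builder, TTI, and Vec
// are placeholders). The helper consults TTI to choose between a reduction
// intrinsic and a shufflevector ladder for the target.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

llvm::Value *reduceAdd(llvm::IRBuilder<> &Builder,
                       const llvm::TargetTransformInfo *TTI,
                       llvm::Value *Vec) {
  // Sum the lanes of a vector of integers into a single scalar.
  return llvm::createSimpleTargetReduction(Builder, TTI,
                                           llvm::Instruction::Add, Vec);
}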
diff --git a/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 10338f7937e8..c514db41623c 100644
--- a/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -24,6 +24,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
@@ -59,7 +60,8 @@ public:
// Glue for old PM.
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_,
- DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_);
+ DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_,
+ OptimizationRemarkEmitter *ORE_);
private:
/// \brief Collect store and getelementptr instructions and organize them
diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap
index 59b1f1621039..5e15e8d49802 100644
--- a/include/llvm/module.modulemap
+++ b/include/llvm/module.modulemap
@@ -148,6 +148,7 @@ module LLVM_intrinsic_gen {
module IR_Attributes { header "IR/Attributes.h" export * }
module IR_CallSite { header "IR/CallSite.h" export * }
module IR_ConstantFolder { header "IR/ConstantFolder.h" export * }
+ module IR_GlobalVariable { header "IR/GlobalVariable.h" export * }
module IR_NoFolder { header "IR/NoFolder.h" export * }
module IR_Module { header "IR/Module.h" export * }
module IR_ModuleSummaryIndex { header "IR/ModuleSummaryIndex.h" export * }
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 537823020301..a33c01a0e461 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -17,13 +17,13 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -36,6 +36,7 @@
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include <algorithm>
#define DEBUG_TYPE "basicaa"
@@ -1283,9 +1284,9 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
// give up if we can't determine conditions that hold for every cycle:
const Value *V = DecompGEP1.VarIndices[i].V;
- bool SignKnownZero, SignKnownOne;
- ComputeSignBit(const_cast<Value *>(V), SignKnownZero, SignKnownOne, DL,
- 0, &AC, nullptr, DT);
+ KnownBits Known = computeKnownBits(V, DL, 0, &AC, nullptr, DT);
+ bool SignKnownZero = Known.isNonNegative();
+ bool SignKnownOne = Known.isNegative();
// Zero-extension widens the variable, and so forces the sign
// bit to zero.
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index 0dc4475ca0e2..db87b17c1567 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -301,6 +301,8 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
WeightSum += Weights[i];
}
}
+ assert(WeightSum <= UINT32_MAX &&
+ "Expected weights to scale down to 32 bits");
if (WeightSum == 0 || ReachableIdxs.size() == 0) {
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
@@ -328,21 +330,14 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
// the difference between reachable blocks.
if (ToDistribute > BranchProbability::getZero()) {
BranchProbability PerEdge = ToDistribute / ReachableIdxs.size();
- for (auto i : ReachableIdxs) {
+ for (auto i : ReachableIdxs)
BP[i] += PerEdge;
- ToDistribute -= PerEdge;
- }
- // Tail goes to the first reachable edge.
- BP[ReachableIdxs[0]] += ToDistribute;
}
}
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
setEdgeProbability(BB, i, BP[i]);
- assert(WeightSum <= UINT32_MAX &&
- "Expected weights to scale down to 32 bits");
-
return true;
}
diff --git a/lib/Analysis/CallGraph.cpp b/lib/Analysis/CallGraph.cpp
index 6942176ae6ae..ff5242f69a1b 100644
--- a/lib/Analysis/CallGraph.cpp
+++ b/lib/Analysis/CallGraph.cpp
@@ -21,23 +21,18 @@ using namespace llvm;
//
CallGraph::CallGraph(Module &M)
- : M(M), Root(nullptr), ExternalCallingNode(getOrInsertFunction(nullptr)),
+ : M(M), ExternalCallingNode(getOrInsertFunction(nullptr)),
CallsExternalNode(llvm::make_unique<CallGraphNode>(nullptr)) {
// Add every function to the call graph.
for (Function &F : M)
addToCallGraph(&F);
-
- // If we didn't find a main function, use the external call graph node
- if (!Root)
- Root = ExternalCallingNode;
}
CallGraph::CallGraph(CallGraph &&Arg)
- : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)), Root(Arg.Root),
+ : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)),
ExternalCallingNode(Arg.ExternalCallingNode),
CallsExternalNode(std::move(Arg.CallsExternalNode)) {
Arg.FunctionMap.clear();
- Arg.Root = nullptr;
Arg.ExternalCallingNode = nullptr;
}
@@ -57,21 +52,9 @@ CallGraph::~CallGraph() {
void CallGraph::addToCallGraph(Function *F) {
CallGraphNode *Node = getOrInsertFunction(F);
- // If this function has external linkage, anything could call it.
- if (!F->hasLocalLinkage()) {
- ExternalCallingNode->addCalledFunction(CallSite(), Node);
-
- // Found the entry point?
- if (F->getName() == "main") {
- if (Root) // Found multiple external mains? Don't pick one.
- Root = ExternalCallingNode;
- else
- Root = Node; // Found a main, keep track of it!
- }
- }
-
- // If this function has its address taken, anything could call it.
- if (F->hasAddressTaken())
+ // If this function has external linkage or has its address taken, anything
+ // could call it.
+ if (!F->hasLocalLinkage() || F->hasAddressTaken())
ExternalCallingNode->addCalledFunction(CallSite(), Node);
// If this function is not defined in this translation unit, it could call
@@ -96,13 +79,6 @@ void CallGraph::addToCallGraph(Function *F) {
}
void CallGraph::print(raw_ostream &OS) const {
- OS << "CallGraph Root is: ";
- if (Function *F = Root->getFunction())
- OS << F->getName() << "\n";
- else {
- OS << "<<null function: 0x" << Root << ">>\n";
- }
-
// Print in a deterministic order by sorting CallGraphNodes by name. We do
// this here to avoid slowing down the non-printing fast path.
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 130e917e49d7..0ca712bbfe70 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1438,6 +1438,36 @@ bool llvm::canConstantFoldCallTo(const Function *F) {
Name == "sinf" || Name == "sinhf" || Name == "sqrtf";
case 't':
return Name == "tan" || Name == "tanh" || Name == "tanf" || Name == "tanhf";
+ case '_':
+
+ // Check for various function names that get used for the math functions
+ // when the header files are preprocessed with the macro
+ // __FINITE_MATH_ONLY__ enabled.
+ // The '12' here is the length of the shortest name that can match.
+ // We need to check the size before looking at Name[1] and Name[2]
+ // so we may as well check a limit that will eliminate mismatches.
+ if (Name.size() < 12 || Name[1] != '_')
+ return false;
+ switch (Name[2]) {
+ default:
+ return false;
+ case 'a':
+ return Name == "__acos_finite" || Name == "__acosf_finite" ||
+ Name == "__asin_finite" || Name == "__asinf_finite" ||
+ Name == "__atan2_finite" || Name == "__atan2f_finite";
+ case 'c':
+ return Name == "__cosh_finite" || Name == "__coshf_finite";
+ case 'e':
+ return Name == "__exp_finite" || Name == "__expf_finite" ||
+ Name == "__exp2_finite" || Name == "__exp2f_finite";
+ case 'l':
+ return Name == "__log_finite" || Name == "__logf_finite" ||
+ Name == "__log10_finite" || Name == "__log10f_finite";
+ case 'p':
+ return Name == "__pow_finite" || Name == "__powf_finite";
+ case 's':
+ return Name == "__sinh_finite" || Name == "__sinhf_finite";
+ }
}
}
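// Editor's note: editorial illustration, not part of the patch. When glibc
// math headers are preprocessed with __FINITE_MATH_ONLY__ (e.g. under
// -ffinite-math-only), a call to exp() reaches the IR as @__exp_finite;
// this change teaches the folder to treat such names like their standard
// counterparts. The helper below is hypothetical.

#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Function.h"

bool foldableFinite(const llvm::Function *F) {
  // After this patch, returns true for a declaration named "__exp_finite"
  // just as it does for one named "exp".
  return llvm::canConstantFoldCallTo(F);
}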
@@ -1637,13 +1667,21 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
if (!TLI)
return nullptr;
- switch (Name[0]) {
+ char NameKeyChar = Name[0];
+ if (Name[0] == '_' && Name.size() > 2 && Name[1] == '_')
+ NameKeyChar = Name[2];
+
+ switch (NameKeyChar) {
case 'a':
if ((Name == "acos" && TLI->has(LibFunc_acos)) ||
- (Name == "acosf" && TLI->has(LibFunc_acosf)))
+ (Name == "acosf" && TLI->has(LibFunc_acosf)) ||
+ (Name == "__acos_finite" && TLI->has(LibFunc_acos_finite)) ||
+ (Name == "__acosf_finite" && TLI->has(LibFunc_acosf_finite)))
return ConstantFoldFP(acos, V, Ty);
else if ((Name == "asin" && TLI->has(LibFunc_asin)) ||
- (Name == "asinf" && TLI->has(LibFunc_asinf)))
+ (Name == "asinf" && TLI->has(LibFunc_asinf)) ||
+ (Name == "__asin_finite" && TLI->has(LibFunc_asin_finite)) ||
+ (Name == "__asinf_finite" && TLI->has(LibFunc_asinf_finite)))
return ConstantFoldFP(asin, V, Ty);
else if ((Name == "atan" && TLI->has(LibFunc_atan)) ||
(Name == "atanf" && TLI->has(LibFunc_atanf)))
@@ -1657,15 +1695,21 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
(Name == "cosf" && TLI->has(LibFunc_cosf)))
return ConstantFoldFP(cos, V, Ty);
else if ((Name == "cosh" && TLI->has(LibFunc_cosh)) ||
- (Name == "coshf" && TLI->has(LibFunc_coshf)))
+ (Name == "coshf" && TLI->has(LibFunc_coshf)) ||
+ (Name == "__cosh_finite" && TLI->has(LibFunc_cosh_finite)) ||
+ (Name == "__coshf_finite" && TLI->has(LibFunc_coshf_finite)))
return ConstantFoldFP(cosh, V, Ty);
break;
case 'e':
if ((Name == "exp" && TLI->has(LibFunc_exp)) ||
- (Name == "expf" && TLI->has(LibFunc_expf)))
+ (Name == "expf" && TLI->has(LibFunc_expf)) ||
+ (Name == "__exp_finite" && TLI->has(LibFunc_exp_finite)) ||
+ (Name == "__expf_finite" && TLI->has(LibFunc_expf_finite)))
return ConstantFoldFP(exp, V, Ty);
if ((Name == "exp2" && TLI->has(LibFunc_exp2)) ||
- (Name == "exp2f" && TLI->has(LibFunc_exp2f)))
+ (Name == "exp2f" && TLI->has(LibFunc_exp2f)) ||
+ (Name == "__exp2_finite" && TLI->has(LibFunc_exp2_finite)) ||
+ (Name == "__exp2f_finite" && TLI->has(LibFunc_exp2f_finite)))
// Constant fold exp2(x) as pow(2,x) in case the host doesn't have a
// C99 library.
return ConstantFoldBinaryFP(pow, 2.0, V, Ty);
@@ -1680,10 +1724,18 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
break;
case 'l':
if ((Name == "log" && V > 0 && TLI->has(LibFunc_log)) ||
- (Name == "logf" && V > 0 && TLI->has(LibFunc_logf)))
+ (Name == "logf" && V > 0 && TLI->has(LibFunc_logf)) ||
+ (Name == "__log_finite" && V > 0 &&
+ TLI->has(LibFunc_log_finite)) ||
+ (Name == "__logf_finite" && V > 0 &&
+ TLI->has(LibFunc_logf_finite)))
return ConstantFoldFP(log, V, Ty);
else if ((Name == "log10" && V > 0 && TLI->has(LibFunc_log10)) ||
- (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f)))
+ (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f)) ||
+ (Name == "__log10_finite" && V > 0 &&
+ TLI->has(LibFunc_log10_finite)) ||
+ (Name == "__log10f_finite" && V > 0 &&
+ TLI->has(LibFunc_log10f_finite)))
return ConstantFoldFP(log10, V, Ty);
break;
case 'r':
@@ -1695,7 +1747,9 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
(Name == "sinf" && TLI->has(LibFunc_sinf)))
return ConstantFoldFP(sin, V, Ty);
else if ((Name == "sinh" && TLI->has(LibFunc_sinh)) ||
- (Name == "sinhf" && TLI->has(LibFunc_sinhf)))
+ (Name == "sinhf" && TLI->has(LibFunc_sinhf)) ||
+ (Name == "__sinh_finite" && TLI->has(LibFunc_sinh_finite)) ||
+ (Name == "__sinhf_finite" && TLI->has(LibFunc_sinhf_finite)))
return ConstantFoldFP(sinh, V, Ty);
else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc_sqrt)) ||
(Name == "sqrtf" && V >= 0 && TLI->has(LibFunc_sqrtf)))
@@ -1813,13 +1867,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
if (!TLI)
return nullptr;
if ((Name == "pow" && TLI->has(LibFunc_pow)) ||
- (Name == "powf" && TLI->has(LibFunc_powf)))
+ (Name == "powf" && TLI->has(LibFunc_powf)) ||
+ (Name == "__pow_finite" && TLI->has(LibFunc_pow_finite)) ||
+ (Name == "__powf_finite" && TLI->has(LibFunc_powf_finite)))
return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
if ((Name == "fmod" && TLI->has(LibFunc_fmod)) ||
(Name == "fmodf" && TLI->has(LibFunc_fmodf)))
return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty);
if ((Name == "atan2" && TLI->has(LibFunc_atan2)) ||
- (Name == "atan2f" && TLI->has(LibFunc_atan2f)))
+ (Name == "atan2f" && TLI->has(LibFunc_atan2f)) ||
+ (Name == "__atan2_finite" && TLI->has(LibFunc_atan2_finite)) ||
+ (Name == "__atan2f_finite" && TLI->has(LibFunc_atan2f_finite)))
return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
} else if (auto *Op2C = dyn_cast<ConstantInt>(Operands[1])) {
if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy())
diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp
index 9f5dc5318239..8f808f3e7871 100644
--- a/lib/Analysis/DemandedBits.cpp
+++ b/lib/Analysis/DemandedBits.cpp
@@ -86,13 +86,11 @@ void DemandedBits::determineLiveOperandBits(
[&](unsigned BitWidth, const Value *V1, const Value *V2) {
const DataLayout &DL = I->getModule()->getDataLayout();
Known = KnownBits(BitWidth);
- computeKnownBits(const_cast<Value *>(V1), Known, DL, 0,
- &AC, UserI, &DT);
+ computeKnownBits(V1, Known, DL, 0, &AC, UserI, &DT);
if (V2) {
Known2 = KnownBits(BitWidth);
- computeKnownBits(const_cast<Value *>(V2), Known2, DL,
- 0, &AC, UserI, &DT);
+ computeKnownBits(V2, Known2, DL, 0, &AC, UserI, &DT);
}
};
@@ -118,7 +116,7 @@ void DemandedBits::determineLiveOperandBits(
// known to be one.
ComputeKnownBits(BitWidth, I, nullptr);
AB = APInt::getHighBitsSet(BitWidth,
- std::min(BitWidth, Known.One.countLeadingZeros()+1));
+ std::min(BitWidth, Known.countMaxLeadingZeros()+1));
}
break;
case Intrinsic::cttz:
@@ -128,7 +126,7 @@ void DemandedBits::determineLiveOperandBits(
// known to be one.
ComputeKnownBits(BitWidth, I, nullptr);
AB = APInt::getLowBitsSet(BitWidth,
- std::min(BitWidth, Known.One.countTrailingZeros()+1));
+ std::min(BitWidth, Known.countMaxTrailingZeros()+1));
}
break;
}
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 100a591e452c..44c14cb17c22 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -63,7 +63,7 @@ static cl::opt<bool>
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
- "inlinecold-threshold", cl::Hidden, cl::init(225),
+ "inlinecold-threshold", cl::Hidden, cl::init(45),
cl::desc("Threshold for inlining functions with cold attribute"));
static cl::opt<int>
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 4a713f441ce8..5728887cc1e9 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -1317,7 +1317,7 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0,
// If all valid bits in the shift amount are known zero, the first operand is
// unchanged.
unsigned NumValidShiftBits = Log2_32_Ceil(BitWidth);
- if (Known.Zero.countTrailingOnes() >= NumValidShiftBits)
+ if (Known.countMinTrailingZeros() >= NumValidShiftBits)
return Op0;
return nullptr;
@@ -1536,7 +1536,7 @@ static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1,
auto Range0 = ConstantRange::makeExactICmpRegion(Cmp0->getPredicate(), *C0);
auto Range1 = ConstantRange::makeExactICmpRegion(Cmp1->getPredicate(), *C1);
- // For and-of-comapares, check if the intersection is empty:
+ // For and-of-compares, check if the intersection is empty:
// (icmp X, C0) && (icmp X, C1) --> empty set --> false
if (IsAnd && Range0.intersectWith(Range1).isEmptySet())
return getFalse(Cmp0->getType());
@@ -1870,6 +1870,24 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))))
return Op0;
+ // (A & B) | (~A ^ B) -> (~A ^ B)
+ // (B & A) | (~A ^ B) -> (~A ^ B)
+ // (A & B) | (B ^ ~A) -> (B ^ ~A)
+ // (B & A) | (B ^ ~A) -> (B ^ ~A)
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ (match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) ||
+ match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B)))))
+ return Op1;
+
+ // (~A ^ B) | (A & B) -> (~A ^ B)
+ // (~A ^ B) | (B & A) -> (~A ^ B)
+ // (B ^ ~A) | (A & B) -> (B ^ ~A)
+ // (B ^ ~A) | (B & A) -> (B ^ ~A)
+ if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
+ (match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) ||
+ match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B)))))
+ return Op0;
+
if (Value *V = simplifyAndOrOfICmps(Op0, Op1, false))
return V;
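// Editor's note: an editorial sanity check of the new absorption fold,
// (A & B) | (~A ^ B) == (~A ^ B), verified exhaustively bit-by-bit.

#include <cassert>

void checkOrXorAbsorb() {
  for (unsigned A = 0; A <= 1; ++A)
    for (unsigned B = 0; B <= 1; ++B) {
      unsigned NotAXorB = (A ^ 1u) ^ B; // ~A ^ B, restricted to one bit
      // Whenever A & B is 1, the xor is also 1, so the or adds nothing.
      assert(((A & B) | NotAXorB) == NotAXorB);
    }
}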
@@ -2286,7 +2304,6 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS,
return nullptr;
Type *ITy = GetCompareTy(LHS); // The return type.
- bool LHSKnownNonNegative, LHSKnownNegative;
switch (Pred) {
default:
llvm_unreachable("Unknown ICmp predicate!");
@@ -2304,39 +2321,41 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS,
if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getTrue(ITy);
break;
- case ICmpInst::ICMP_SLT:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ case ICmpInst::ICMP_SLT: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getTrue(ITy);
- if (LHSKnownNonNegative)
+ if (LHSKnown.isNonNegative())
return getFalse(ITy);
break;
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SLE: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getTrue(ITy);
- if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
+ if (LHSKnown.isNonNegative() &&
+ isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getFalse(ITy);
break;
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SGE: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getFalse(ITy);
- if (LHSKnownNonNegative)
+ if (LHSKnown.isNonNegative())
return getTrue(ITy);
break;
- case ICmpInst::ICMP_SGT:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SGT: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getFalse(ITy);
- if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
+ if (LHSKnown.isNonNegative() &&
+ isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getTrue(ITy);
break;
}
+ }
return nullptr;
}
@@ -2535,6 +2554,9 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
return nullptr;
}
+/// TODO: A large part of this logic is duplicated in InstCombine's
+/// foldICmpBinOp(). We should be able to share that and avoid the code
+/// duplication.
static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
Value *RHS, const SimplifyQuery &Q,
unsigned MaxRecurse) {
@@ -2616,15 +2638,11 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
return getTrue(ITy);
if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) {
- bool RHSKnownNonNegative, RHSKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, Q.DL, 0,
- Q.AC, Q.CxtI, Q.DT);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (RHSKnownNonNegative && YKnownNegative)
+ KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (RHSKnown.isNonNegative() && YKnown.isNegative())
return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy);
- if (RHSKnownNegative || YKnownNonNegative)
+ if (RHSKnown.isNegative() || YKnown.isNonNegative())
return Pred == ICmpInst::ICMP_SLT ? getFalse(ITy) : getTrue(ITy);
}
}
@@ -2636,15 +2654,11 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
return getFalse(ITy);
if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) {
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0,
- Q.AC, Q.CxtI, Q.DT);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNonNegative && YKnownNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNonNegative() && YKnown.isNegative())
return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy);
- if (LHSKnownNegative || YKnownNonNegative)
+ if (LHSKnown.isNegative() || YKnown.isNonNegative())
return Pred == ICmpInst::ICMP_SGT ? getFalse(ITy) : getTrue(ITy);
}
}
@@ -2691,28 +2705,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
// icmp pred (urem X, Y), Y
if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) {
- bool KnownNonNegative, KnownNegative;
switch (Pred) {
default:
break;
case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SGE: {
+ KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
return getFalse(ITy);
case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SLE: {
+ KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
@@ -2722,28 +2735,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
// icmp pred X, (urem Y, X)
if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) {
- bool KnownNonNegative, KnownNegative;
switch (Pred) {
default:
break;
case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SGE: {
+ KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
return getTrue(ITy);
case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SLE: {
+ KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
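Both urem blocks rely on the same arithmetic fact, and the computeKnownBits guard exists only so the signed orderings can be reduced to the unsigned ones when the operand is known non-negative. A quick exhaustive 8-bit check of the underlying invariant:

#include <cassert>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 1; Y < 256; ++Y)
      assert(X % Y < Y);  // so icmp ult (urem X, Y), Y folds to true, and
                          // icmp uge (urem X, Y), Y folds to false
}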
@@ -2815,10 +2827,19 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
break;
case Instruction::UDiv:
case Instruction::LShr:
- if (ICmpInst::isSigned(Pred))
+ if (ICmpInst::isSigned(Pred) || !LBO->isExact() || !RBO->isExact())
break;
- LLVM_FALLTHROUGH;
+ if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
+ RBO->getOperand(0), Q, MaxRecurse - 1))
+ return V;
+ break;
case Instruction::SDiv:
+ if (!ICmpInst::isEquality(Pred) || !LBO->isExact() || !RBO->isExact())
+ break;
+ if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
+ RBO->getOperand(0), Q, MaxRecurse - 1))
+ return V;
+ break;
case Instruction::AShr:
if (!LBO->isExact() || !RBO->isExact())
break;
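The rewritten UDiv/LShr case now requires both operands to be exact before comparing the dividends directly. Exactness is what makes the fold bidirectional: when no bits are shifted out, unsigned order is preserved in both directions. An exhaustive 8-bit check of that claim:

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      for (unsigned K = 0; K < 8; ++K) {
        bool ExactA = ((A >> K) << K) == A;  // no bits shifted out of A
        bool ExactB = ((B >> K) << K) == B;  // no bits shifted out of B
        if (ExactA && ExactB)
          assert((A < B) == ((A >> K) < (B >> K)));
      }
}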
@@ -4034,24 +4055,21 @@ Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
/// match a root vector source operand that contains that element in the same
/// vector lane (i.e., the same mask index), so we can eliminate the shuffle(s).
static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1,
- Constant *Mask, Value *RootVec, int RootElt,
+ int MaskVal, Value *RootVec,
unsigned MaxRecurse) {
if (!MaxRecurse--)
return nullptr;
// Bail out if any mask value is undefined. That kind of shuffle may be
// simplified further based on demanded bits or other folds.
- int MaskVal = ShuffleVectorInst::getMaskValue(Mask, RootElt);
if (MaskVal == -1)
return nullptr;
// The mask value chooses which source operand we need to look at next.
- Value *SourceOp;
int InVecNumElts = Op0->getType()->getVectorNumElements();
- if (MaskVal < InVecNumElts) {
- RootElt = MaskVal;
- SourceOp = Op0;
- } else {
+ int RootElt = MaskVal;
+ Value *SourceOp = Op0;
+ if (MaskVal >= InVecNumElts) {
RootElt = MaskVal - InVecNumElts;
SourceOp = Op1;
}
@@ -4061,7 +4079,7 @@ static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1,
if (auto *SourceShuf = dyn_cast<ShuffleVectorInst>(SourceOp)) {
return foldIdentityShuffles(
DestElt, SourceShuf->getOperand(0), SourceShuf->getOperand(1),
- SourceShuf->getMask(), RootVec, RootElt, MaxRecurse);
+ SourceShuf->getMaskValue(RootElt), RootVec, MaxRecurse);
}
// TODO: Look through bitcasts? What if the bitcast changes the vector element
@@ -4126,17 +4144,7 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
// second one.
if (Op0Const && !Op1Const) {
std::swap(Op0, Op1);
- for (int &Idx : Indices) {
- if (Idx == -1)
- continue;
- Idx = Idx < (int)InVecNumElts ? Idx + InVecNumElts : Idx - InVecNumElts;
- assert(Idx >= 0 && Idx < (int)InVecNumElts * 2 &&
- "shufflevector mask index out of range");
- }
- Mask = ConstantDataVector::get(
- Mask->getContext(),
- makeArrayRef(reinterpret_cast<uint32_t *>(Indices.data()),
- MaskNumElts));
+ ShuffleVectorInst::commuteShuffleMask(Indices, InVecNumElts);
}
// A shuffle of a splat is always the splat itself. Legal if the shuffle's
@@ -4160,7 +4168,8 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
for (unsigned i = 0; i != MaskNumElts; ++i) {
// Note that recursion is limited for each vector element, so if any element
// exceeds the limit, this will fail to simplify.
- RootVec = foldIdentityShuffles(i, Op0, Op1, Mask, RootVec, i, MaxRecurse);
+ RootVec =
+ foldIdentityShuffles(i, Op0, Op1, Indices[i], RootVec, MaxRecurse);
// We can't replace a widening/narrowing shuffle with one of its operands.
if (!RootVec || RootVec->getType() != RetTy)
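After this change foldIdentityShuffles receives a single pre-extracted mask element rather than the whole constant mask. A standalone sketch of the lane-chasing walk it performs, under the simplifying assumption that all vectors involved have the same element count (the Shuffle struct is illustrative, not LLVM's):

#include <cassert>
#include <vector>

struct Shuffle {                 // hypothetical two-operand shuffle node
  const Shuffle *Op0, *Op1;      // null operands terminate the chain
  std::vector<int> Mask;
};

// Returns the lane in the leaf source vector that DestElt ultimately reads
// from, or -1 if an undef mask element is encountered along the way.
int chaseLane(const Shuffle &S, int DestElt) {
  int MaskVal = S.Mask[DestElt];
  if (MaskVal == -1)
    return -1;                                   // undef: give up
  int NumElts = (int)S.Mask.size();              // same-width vectors assumed
  const Shuffle *Src = S.Op0;
  int RootElt = MaskVal;
  if (MaskVal >= NumElts) {                      // element comes from Op1
    RootElt = MaskVal - NumElts;
    Src = S.Op1;
  }
  return Src ? chaseLane(*Src, RootElt) : RootElt;
}

int main() {
  Shuffle Inner{nullptr, nullptr, {3, 2, 1, 0}}; // reverse
  Shuffle Outer{&Inner, nullptr, {3, 2, 1, 0}};  // reverse of the reverse
  for (int i = 0; i < 4; ++i)
    assert(chaseLane(Outer, i) == i);            // identity overall
}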
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index 99f900ae3932..26706f5509ba 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -232,7 +232,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
}
// We should have named any anonymous globals
assert(CalledFunction->hasName());
- auto ScaledCount = ProfileSummaryInfo::getProfileCount(&I, BFI);
+ auto ScaledCount = PSI->getProfileCount(&I, BFI);
auto Hotness = ScaledCount ? getHotness(ScaledCount.getValue(), PSI)
: CalleeInfo::HotnessType::Unknown;
@@ -330,6 +330,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
const Module &M,
std::function<BlockFrequencyInfo *(const Function &F)> GetBFICallback,
ProfileSummaryInfo *PSI) {
+ assert(PSI);
ModuleSummaryIndex Index;
// Identify the local values in the llvm.used and llvm.compiler.used sets,
diff --git a/lib/Analysis/OptimizationDiagnosticInfo.cpp b/lib/Analysis/OptimizationDiagnosticInfo.cpp
index 73245981b022..e38e530c052d 100644
--- a/lib/Analysis/OptimizationDiagnosticInfo.cpp
+++ b/lib/Analysis/OptimizationDiagnosticInfo.cpp
@@ -101,7 +101,7 @@ void MappingTraits<DiagnosticInfoOptimizationBase *>::mapping(
// These are read-only for now.
DiagnosticLocation DL = OptDiag->getLocation();
StringRef FN =
- GlobalValue::getRealLinkageName(OptDiag->getFunction().getName());
+ GlobalValue::dropLLVMManglingEscape(OptDiag->getFunction().getName());
StringRef PassName(OptDiag->PassName);
io.mapRequired("Pass", PassName);
diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp
index 1a53a8ed4283..502f4205b689 100644
--- a/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/lib/Analysis/ProfileSummaryInfo.cpp
@@ -75,11 +75,14 @@ ProfileSummaryInfo::getProfileCount(const Instruction *Inst,
return None;
assert((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
"We can only get profile count for call/invoke instruction.");
- // Check if there is a profile metadata on the instruction. If it is present,
- // determine hotness solely based on that.
- uint64_t TotalCount;
- if (Inst->extractProfTotalWeight(TotalCount))
- return TotalCount;
+ if (computeSummary() && Summary->getKind() == ProfileSummary::PSK_Sample) {
+ // In sample PGO mode, check if there is a profile metadata on the
+ // instruction. If it is present, determine hotness solely based on that,
+ // since the sampled entry count may not be accurate.
+ uint64_t TotalCount;
+ if (Inst->extractProfTotalWeight(TotalCount))
+ return TotalCount;
+ }
if (BFI)
return BFI->getBlockProfileCount(Inst->getParent());
return None;
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 01dca0793145..800354d2f5b4 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -584,7 +584,7 @@ CompareValueComplexity(SmallSet<std::pair<Value *, Value *>, 8> &EqCache,
static int CompareSCEVComplexity(
SmallSet<std::pair<const SCEV *, const SCEV *>, 8> &EqCacheSCEV,
const LoopInfo *const LI, const SCEV *LHS, const SCEV *RHS,
- unsigned Depth = 0) {
+ DominatorTree &DT, unsigned Depth = 0) {
// Fast-path: SCEVs are uniqued so we can do a quick equality check.
if (LHS == RHS)
return 0;
@@ -629,9 +629,16 @@ static int CompareSCEVComplexity(
const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS);
const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS);
- // Compare addrec loop depths.
+ // If there is a dominance relationship between the loops, sort by
+ // dominance. Otherwise, sort by depth. getAddExpr requires this order.
const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop();
if (LLoop != RLoop) {
+ const BasicBlock *LHead = LLoop->getHeader(), *RHead = RLoop->getHeader();
+ assert(LHead != RHead && "Two loops share the same header?");
+ if (DT.dominates(LHead, RHead))
+ return 1;
+ else if (DT.dominates(RHead, LHead))
+ return -1;
unsigned LDepth = LLoop->getLoopDepth(), RDepth = RLoop->getLoopDepth();
if (LDepth != RDepth)
return (int)LDepth - (int)RDepth;
@@ -645,7 +652,7 @@ static int CompareSCEVComplexity(
// Lexicographically compare.
for (unsigned i = 0; i != LNumOps; ++i) {
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LA->getOperand(i),
- RA->getOperand(i), Depth + 1);
+ RA->getOperand(i), DT, Depth + 1);
if (X != 0)
return X;
}
@@ -669,7 +676,7 @@ static int CompareSCEVComplexity(
if (i >= RNumOps)
return 1;
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(i),
- RC->getOperand(i), Depth + 1);
+ RC->getOperand(i), DT, Depth + 1);
if (X != 0)
return X;
}
@@ -683,10 +690,10 @@ static int CompareSCEVComplexity(
// Lexicographically compare udiv expressions.
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getLHS(), RC->getLHS(),
- Depth + 1);
+ DT, Depth + 1);
if (X != 0)
return X;
- X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(),
+ X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(), DT,
Depth + 1);
if (X == 0)
EqCacheSCEV.insert({LHS, RHS});
@@ -701,7 +708,7 @@ static int CompareSCEVComplexity(
// Compare cast expressions by operand.
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(),
- RC->getOperand(), Depth + 1);
+ RC->getOperand(), DT, Depth + 1);
if (X == 0)
EqCacheSCEV.insert({LHS, RHS});
return X;
@@ -724,7 +731,7 @@ static int CompareSCEVComplexity(
/// land in memory.
///
static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
- LoopInfo *LI) {
+ LoopInfo *LI, DominatorTree &DT) {
if (Ops.size() < 2) return; // Noop
SmallSet<std::pair<const SCEV *, const SCEV *>, 8> EqCache;
@@ -732,15 +739,16 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
// This is the common case, which also happens to be trivially simple.
// Special case it.
const SCEV *&LHS = Ops[0], *&RHS = Ops[1];
- if (CompareSCEVComplexity(EqCache, LI, RHS, LHS) < 0)
+ if (CompareSCEVComplexity(EqCache, LI, RHS, LHS, DT) < 0)
std::swap(LHS, RHS);
return;
}
// Do the rough sort by complexity.
std::stable_sort(Ops.begin(), Ops.end(),
- [&EqCache, LI](const SCEV *LHS, const SCEV *RHS) {
- return CompareSCEVComplexity(EqCache, LI, LHS, RHS) < 0;
+ [&EqCache, LI, &DT](const SCEV *LHS, const SCEV *RHS) {
+ return CompareSCEVComplexity(EqCache, LI, LHS, RHS, DT) < 0;
});
// Now that we are sorted by complexity, group elements of the same
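GroupByComplexity adapts the three-way CompareSCEVComplexity result (now dominance-aware via the new DominatorTree parameter) into the strict-weak-ordering predicate std::stable_sort expects; the stable sort is what keeps equal-complexity operands grouped. The adaptation in isolation, on plain ints:

#include <algorithm>
#include <cassert>
#include <vector>

// Three-way comparator returning negative/zero/positive, like
// CompareSCEVComplexity does for SCEVs.
static int compareThreeWay(int L, int R) { return (L > R) - (L < R); }

int main() {
  std::vector<int> Ops = {3, 1, 2, 1};
  std::stable_sort(Ops.begin(), Ops.end(), [](int L, int R) {
    return compareThreeWay(L, R) < 0;  // "<" carved out of the 3-way result
  });
  assert(std::is_sorted(Ops.begin(), Ops.end()));
}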
@@ -2186,7 +2194,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags);
@@ -2492,7 +2500,13 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// added together. If so, we can fold them.
for (unsigned OtherIdx = Idx+1;
OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
- ++OtherIdx)
+ ++OtherIdx) {
+ // We expect the AddRecExprs to be sorted in reverse dominance order,
+ // so that the first AddRecExpr found is dominated by all the others.
+ assert(DT.dominates(
+ cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()->getHeader(),
+ AddRec->getLoop()->getHeader()) &&
+ "AddRecExprs are not sorted in reverse dominance order?");
if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) {
// Other + {A,+,B}<L> + {C,+,D}<L> --> Other + {A+C,+,B+D}<L>
SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(),
@@ -2518,6 +2532,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap);
return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
+ }
// Otherwise couldn't fold anything into this recurrence. Move onto the
// next one.
@@ -2614,7 +2629,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags);
@@ -3211,7 +3226,7 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
// If there are any constants, fold them together.
unsigned Idx = 0;
@@ -3312,7 +3327,7 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
// If there are any constants, fold them together.
unsigned Idx = 0;
@@ -4636,7 +4651,7 @@ uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) {
KnownBits Known(BitWidth);
computeKnownBits(U->getValue(), Known, getDataLayout(), 0, &AC,
nullptr, &DT);
- return Known.Zero.countTrailingOnes();
+ return Known.countMinTrailingZeros();
}
// SCEVUDivExpr
@@ -5955,6 +5970,30 @@ bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S,
return false;
}
+ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E)
+ : ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {}
+
+ScalarEvolution::ExitLimit::ExitLimit(
+ const SCEV *E, const SCEV *M, bool MaxOrZero,
+ ArrayRef<const SmallPtrSetImpl<const SCEVPredicate *> *> PredSetList)
+ : ExactNotTaken(E), MaxNotTaken(M), MaxOrZero(MaxOrZero) {
+ assert((isa<SCEVCouldNotCompute>(ExactNotTaken) ||
+ !isa<SCEVCouldNotCompute>(MaxNotTaken)) &&
+ "Exact is not allowed to be less precise than Max");
+ for (auto *PredSet : PredSetList)
+ for (auto *P : *PredSet)
+ addPredicate(P);
+}
+
+ScalarEvolution::ExitLimit::ExitLimit(
+ const SCEV *E, const SCEV *M, bool MaxOrZero,
+ const SmallPtrSetImpl<const SCEVPredicate *> &PredSet)
+ : ExitLimit(E, M, MaxOrZero, {&PredSet}) {}
+
+ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E, const SCEV *M,
+ bool MaxOrZero)
+ : ExitLimit(E, M, MaxOrZero, None) {}
+
/// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each
/// computable exit into a persistent ExitNotTakenInfo array.
ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
@@ -6637,13 +6676,12 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit(
// {K,ashr,<positive-constant>} stabilizes to signum(K) in at most
// bitwidth(K) iterations.
Value *FirstValue = PN->getIncomingValueForBlock(Predecessor);
- bool KnownZero, KnownOne;
- ComputeSignBit(FirstValue, KnownZero, KnownOne, DL, 0, nullptr,
- Predecessor->getTerminator(), &DT);
+ KnownBits Known = computeKnownBits(FirstValue, DL, 0, nullptr,
+ Predecessor->getTerminator(), &DT);
auto *Ty = cast<IntegerType>(RHS->getType());
- if (KnownZero)
+ if (Known.isNonNegative())
StableValue = ConstantInt::get(Ty, 0);
- else if (KnownOne)
+ else if (Known.isNegative())
StableValue = ConstantInt::get(Ty, -1, true);
else
return getCouldNotCompute();
@@ -7377,48 +7415,49 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
const APInt &N = NC->getAPInt();
APInt Two(BitWidth, 2);
- {
- using namespace APIntOps;
- const APInt& C = L;
- // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
- // The B coefficient is M-N/2
- APInt B(M);
- B -= N.sdiv(Two);
+ // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
- // The A coefficient is N/2
- APInt A(N.sdiv(Two));
+ // The A coefficient is N/2
+ APInt A = N.sdiv(Two);
- // Compute the B^2-4ac term.
- APInt SqrtTerm(B);
- SqrtTerm *= B;
- SqrtTerm -= 4 * (A * C);
+ // The B coefficient is M-N/2
+ APInt B = M;
+ B -= A; // A is the same as N/2.
- if (SqrtTerm.isNegative()) {
- // The loop is provably infinite.
- return None;
- }
+ // The C coefficient is L.
+ const APInt& C = L;
- // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
- // integer value or else APInt::sqrt() will assert.
- APInt SqrtVal(SqrtTerm.sqrt());
+ // Compute the B^2-4ac term.
+ APInt SqrtTerm = B;
+ SqrtTerm *= B;
+ SqrtTerm -= 4 * (A * C);
- // Compute the two solutions for the quadratic formula.
- // The divisions must be performed as signed divisions.
- APInt NegB(-B);
- APInt TwoA(A << 1);
- if (TwoA.isMinValue())
- return None;
+ if (SqrtTerm.isNegative()) {
+ // The loop is provably infinite.
+ return None;
+ }
+
+ // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
+ // integer value or else APInt::sqrt() will assert.
+ APInt SqrtVal = SqrtTerm.sqrt();
+
+ // Compute the two solutions for the quadratic formula.
+ // The divisions must be performed as signed divisions.
+ APInt NegB = -std::move(B);
+ APInt TwoA = std::move(A);
+ TwoA <<= 1;
+ if (TwoA.isNullValue())
+ return None;
- LLVMContext &Context = SE.getContext();
+ LLVMContext &Context = SE.getContext();
- ConstantInt *Solution1 =
- ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA));
- ConstantInt *Solution2 =
- ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA));
+ ConstantInt *Solution1 =
+ ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA));
+ ConstantInt *Solution2 =
+ ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA));
- return std::make_pair(cast<SCEVConstant>(SE.getConstant(Solution1)),
- cast<SCEVConstant>(SE.getConstant(Solution2)));
- } // end APIntOps namespace
+ return std::make_pair(cast<SCEVConstant>(SE.getConstant(Solution1)),
+ cast<SCEVConstant>(SE.getConstant(Solution2)));
}
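The restructured block above solves the chrec {L,+,M,+,N}, whose value at step x is L + M*x + N*x*(x-1)/2 = A*x^2 + B*x + C with A = N/2, B = M - A, C = L. A standalone recomputation using 64-bit arithmetic and a floating-point sqrt in place of APInt's exact integer sqrt:

#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  const int64_t L = 6, M = -4, N = 2;      // chrec {6,+,-4,+,2}
  const int64_t A = N / 2, B = M - A, C = L;
  const int64_t Disc = B * B - 4 * A * C;  // 25 - 24 = 1
  assert(Disc >= 0);                       // negative => provably infinite loop
  const int64_t SqrtVal = (int64_t)std::llround(std::sqrt((double)Disc));
  const int64_t S1 = (-B + SqrtVal) / (2 * A);  // signed divisions, as above
  const int64_t S2 = (-B - SqrtVal) / (2 * A);
  assert(S1 == 3 && S2 == 2);              // value(x) = x*x - 5*x + 6
  // Sanity: evaluate the chrec directly at both roots.
  auto Val = [&](int64_t X) { return L + M * X + N * (X * (X - 1) / 2); };
  assert(Val(S1) == 0 && Val(S2) == 0);
}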
ScalarEvolution::ExitLimit
@@ -8976,7 +9015,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
.getSignedMax();
// SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow!
- return (std::move(MaxValue) - std::move(MaxStrideMinusOne)).slt(MaxRHS);
+ return (std::move(MaxValue) - MaxStrideMinusOne).slt(MaxRHS);
}
APInt MaxRHS = getUnsignedRange(RHS).getUnsignedMax();
@@ -8985,7 +9024,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
.getUnsignedMax();
// UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow!
- return (std::move(MaxValue) - std::move(MaxStrideMinusOne)).ult(MaxRHS);
+ return (std::move(MaxValue) - MaxStrideMinusOne).ult(MaxRHS);
}
bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
@@ -9002,7 +9041,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
.getSignedMax();
// SMinRHS - SMaxStrideMinusOne < SMinValue => overflow!
- return (std::move(MinValue) + std::move(MaxStrideMinusOne)).sgt(MinRHS);
+ return (std::move(MinValue) + MaxStrideMinusOne).sgt(MinRHS);
}
APInt MinRHS = getUnsignedRange(RHS).getUnsignedMin();
@@ -9011,7 +9050,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
.getUnsignedMax();
// UMinRHS - UMaxStrideMinusOne < UMinValue => overflow!
- return (std::move(MinValue) + std::move(MaxStrideMinusOne)).ugt(MinRHS);
+ return (std::move(MinValue) + MaxStrideMinusOne).ugt(MinRHS);
}
const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step,
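The four predicates touched here all test whether the final IV increment can wrap without performing the wrapping addition, by moving the stride term to the other side: RHS + (Stride-1) > MAX is checked as MAX - (Stride-1) < RHS. An exhaustive 8-bit sanity check of the unsigned form:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t UMax = 255;                    // unsigned max at 8 bits
  for (uint32_t RHS = 0; RHS <= 255; ++RHS)
    for (uint32_t Stride = 1; Stride <= 255; ++Stride) {
      bool Wraps = RHS + (Stride - 1) > UMax;   // computed safely at 32 bits
      assert(Wraps == (UMax - (Stride - 1) < RHS));
    }
}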
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index 848e1b4717b5..3cf1bbc5daa5 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -241,6 +241,50 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_tanhf);
}
+ // These definitions come from the math-finite.h header on Linux.
+ TLI.setUnavailable(LibFunc_acos_finite);
+ TLI.setUnavailable(LibFunc_acosf_finite);
+ TLI.setUnavailable(LibFunc_acosl_finite);
+ TLI.setUnavailable(LibFunc_acosh_finite);
+ TLI.setUnavailable(LibFunc_acoshf_finite);
+ TLI.setUnavailable(LibFunc_acoshl_finite);
+ TLI.setUnavailable(LibFunc_asin_finite);
+ TLI.setUnavailable(LibFunc_asinf_finite);
+ TLI.setUnavailable(LibFunc_asinl_finite);
+ TLI.setUnavailable(LibFunc_atan2_finite);
+ TLI.setUnavailable(LibFunc_atan2f_finite);
+ TLI.setUnavailable(LibFunc_atan2l_finite);
+ TLI.setUnavailable(LibFunc_atanh_finite);
+ TLI.setUnavailable(LibFunc_atanhf_finite);
+ TLI.setUnavailable(LibFunc_atanhl_finite);
+ TLI.setUnavailable(LibFunc_cosh_finite);
+ TLI.setUnavailable(LibFunc_coshf_finite);
+ TLI.setUnavailable(LibFunc_coshl_finite);
+ TLI.setUnavailable(LibFunc_exp10_finite);
+ TLI.setUnavailable(LibFunc_exp10f_finite);
+ TLI.setUnavailable(LibFunc_exp10l_finite);
+ TLI.setUnavailable(LibFunc_exp2_finite);
+ TLI.setUnavailable(LibFunc_exp2f_finite);
+ TLI.setUnavailable(LibFunc_exp2l_finite);
+ TLI.setUnavailable(LibFunc_exp_finite);
+ TLI.setUnavailable(LibFunc_expf_finite);
+ TLI.setUnavailable(LibFunc_expl_finite);
+ TLI.setUnavailable(LibFunc_log10_finite);
+ TLI.setUnavailable(LibFunc_log10f_finite);
+ TLI.setUnavailable(LibFunc_log10l_finite);
+ TLI.setUnavailable(LibFunc_log2_finite);
+ TLI.setUnavailable(LibFunc_log2f_finite);
+ TLI.setUnavailable(LibFunc_log2l_finite);
+ TLI.setUnavailable(LibFunc_log_finite);
+ TLI.setUnavailable(LibFunc_logf_finite);
+ TLI.setUnavailable(LibFunc_logl_finite);
+ TLI.setUnavailable(LibFunc_pow_finite);
+ TLI.setUnavailable(LibFunc_powf_finite);
+ TLI.setUnavailable(LibFunc_powl_finite);
+ TLI.setUnavailable(LibFunc_sinh_finite);
+ TLI.setUnavailable(LibFunc_sinhf_finite);
+ TLI.setUnavailable(LibFunc_sinhl_finite);
+
// Win32 does *not* provide these functions, but they are
// generally available on POSIX-compliant systems:
TLI.setUnavailable(LibFunc_access);
@@ -496,7 +540,7 @@ static StringRef sanitizeFunctionName(StringRef funcName) {
// Check for \01 prefix that is used to mangle __asm declarations and
// strip it if present.
- return GlobalValue::getRealLinkageName(funcName);
+ return GlobalValue::dropLLVMManglingEscape(funcName);
}
bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName,
@@ -1004,22 +1048,34 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
return (NumParams == 1 && FTy.getParamType(0)->isFloatingPointTy());
case LibFunc_acos:
+ case LibFunc_acos_finite:
case LibFunc_acosf:
+ case LibFunc_acosf_finite:
case LibFunc_acosh:
+ case LibFunc_acosh_finite:
case LibFunc_acoshf:
+ case LibFunc_acoshf_finite:
case LibFunc_acoshl:
+ case LibFunc_acoshl_finite:
case LibFunc_acosl:
+ case LibFunc_acosl_finite:
case LibFunc_asin:
+ case LibFunc_asin_finite:
case LibFunc_asinf:
+ case LibFunc_asinf_finite:
case LibFunc_asinh:
case LibFunc_asinhf:
case LibFunc_asinhl:
case LibFunc_asinl:
+ case LibFunc_asinl_finite:
case LibFunc_atan:
case LibFunc_atanf:
case LibFunc_atanh:
+ case LibFunc_atanh_finite:
case LibFunc_atanhf:
+ case LibFunc_atanhf_finite:
case LibFunc_atanhl:
+ case LibFunc_atanhl_finite:
case LibFunc_atanl:
case LibFunc_cbrt:
case LibFunc_cbrtf:
@@ -1030,18 +1086,30 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_cos:
case LibFunc_cosf:
case LibFunc_cosh:
+ case LibFunc_cosh_finite:
case LibFunc_coshf:
+ case LibFunc_coshf_finite:
case LibFunc_coshl:
+ case LibFunc_coshl_finite:
case LibFunc_cosl:
case LibFunc_exp10:
+ case LibFunc_exp10_finite:
case LibFunc_exp10f:
+ case LibFunc_exp10f_finite:
case LibFunc_exp10l:
+ case LibFunc_exp10l_finite:
case LibFunc_exp2:
+ case LibFunc_exp2_finite:
case LibFunc_exp2f:
+ case LibFunc_exp2f_finite:
case LibFunc_exp2l:
+ case LibFunc_exp2l_finite:
case LibFunc_exp:
+ case LibFunc_exp_finite:
case LibFunc_expf:
+ case LibFunc_expf_finite:
case LibFunc_expl:
+ case LibFunc_expl_finite:
case LibFunc_expm1:
case LibFunc_expm1f:
case LibFunc_expm1l:
@@ -1052,20 +1120,29 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_floorf:
case LibFunc_floorl:
case LibFunc_log10:
+ case LibFunc_log10_finite:
case LibFunc_log10f:
+ case LibFunc_log10f_finite:
case LibFunc_log10l:
+ case LibFunc_log10l_finite:
case LibFunc_log1p:
case LibFunc_log1pf:
case LibFunc_log1pl:
case LibFunc_log2:
+ case LibFunc_log2_finite:
case LibFunc_log2f:
+ case LibFunc_log2f_finite:
case LibFunc_log2l:
+ case LibFunc_log2l_finite:
case LibFunc_log:
+ case LibFunc_log_finite:
case LibFunc_logb:
case LibFunc_logbf:
case LibFunc_logbl:
case LibFunc_logf:
+ case LibFunc_logf_finite:
case LibFunc_logl:
+ case LibFunc_logl_finite:
case LibFunc_nearbyint:
case LibFunc_nearbyintf:
case LibFunc_nearbyintl:
@@ -1078,8 +1155,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_sin:
case LibFunc_sinf:
case LibFunc_sinh:
+ case LibFunc_sinh_finite:
case LibFunc_sinhf:
+ case LibFunc_sinhf_finite:
case LibFunc_sinhl:
+ case LibFunc_sinhl_finite:
case LibFunc_sinl:
case LibFunc_sqrt:
case LibFunc_sqrt_finite:
@@ -1100,8 +1180,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
FTy.getReturnType() == FTy.getParamType(0));
case LibFunc_atan2:
+ case LibFunc_atan2_finite:
case LibFunc_atan2f:
+ case LibFunc_atan2f_finite:
case LibFunc_atan2l:
+ case LibFunc_atan2l_finite:
case LibFunc_fmin:
case LibFunc_fminf:
case LibFunc_fminl:
@@ -1115,8 +1198,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_copysignf:
case LibFunc_copysignl:
case LibFunc_pow:
+ case LibFunc_pow_finite:
case LibFunc_powf:
+ case LibFunc_powf_finite:
case LibFunc_powl:
+ case LibFunc_powl_finite:
return (NumParams == 2 && FTy.getReturnType()->isFloatingPointTy() &&
FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getReturnType() == FTy.getParamType(1));
@@ -1294,6 +1380,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"powf", "__svml_powf8", 8},
{"powf", "__svml_powf16", 16},
+ { "__pow_finite", "__svml_pow2", 2 },
+ { "__pow_finite", "__svml_pow4", 4 },
+ { "__pow_finite", "__svml_pow8", 8 },
+
+ { "__powf_finite", "__svml_powf4", 4 },
+ { "__powf_finite", "__svml_powf8", 8 },
+ { "__powf_finite", "__svml_powf16", 16 },
+
{"llvm.pow.f64", "__svml_pow2", 2},
{"llvm.pow.f64", "__svml_pow4", 4},
{"llvm.pow.f64", "__svml_pow8", 8},
@@ -1310,6 +1404,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"expf", "__svml_expf8", 8},
{"expf", "__svml_expf16", 16},
+ { "__exp_finite", "__svml_exp2", 2 },
+ { "__exp_finite", "__svml_exp4", 4 },
+ { "__exp_finite", "__svml_exp8", 8 },
+
+ { "__expf_finite", "__svml_expf4", 4 },
+ { "__expf_finite", "__svml_expf8", 8 },
+ { "__expf_finite", "__svml_expf16", 16 },
+
{"llvm.exp.f64", "__svml_exp2", 2},
{"llvm.exp.f64", "__svml_exp4", 4},
{"llvm.exp.f64", "__svml_exp8", 8},
@@ -1326,6 +1428,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"logf", "__svml_logf8", 8},
{"logf", "__svml_logf16", 16},
+ { "__log_finite", "__svml_log2", 2 },
+ { "__log_finite", "__svml_log4", 4 },
+ { "__log_finite", "__svml_log8", 8 },
+
+ { "__logf_finite", "__svml_logf4", 4 },
+ { "__logf_finite", "__svml_logf8", 8 },
+ { "__logf_finite", "__svml_logf16", 16 },
+
{"llvm.log.f64", "__svml_log2", 2},
{"llvm.log.f64", "__svml_log4", 4},
{"llvm.log.f64", "__svml_log8", 8},
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 26d606cce9bb..8a5d10473662 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -279,6 +279,10 @@ unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
return TTIImpl->getRegisterBitWidth(Vector);
}
+unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
+ return TTIImpl->getMinVectorRegisterBitWidth();
+}
+
bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
return TTIImpl->shouldConsiderAddressTypePromotion(
@@ -500,6 +504,15 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
+bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode,
+ Type *Ty, ReductionFlags Flags) const {
+ return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
+}
+
+bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
+ return TTIImpl->shouldExpandReduction(II);
+}
+
TargetTransformInfo::Concept::~Concept() {}
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index a7f3ff672aef..cba7363a0afa 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -88,9 +88,8 @@ struct Query {
/// classic case of this is assume(x = y), which will attempt to determine
/// bits in x from bits in y, which will attempt to determine bits in y from
/// bits in x, etc. Regarding the mutual recursion, computeKnownBits can call
- /// isKnownNonZero, which calls computeKnownBits and ComputeSignBit and
- /// isKnownToBeAPowerOfTwo (all of which can call computeKnownBits), and so
- /// on.
+ /// isKnownNonZero, which calls computeKnownBits and isKnownToBeAPowerOfTwo
+ /// (all of which can call computeKnownBits), and so on.
std::array<const Value *, MaxDepth> Excluded;
unsigned NumExcluded;
@@ -143,6 +142,16 @@ void llvm::computeKnownBits(const Value *V, KnownBits &Known,
Query(DL, AC, safeCxtI(V, CxtI), DT, ORE));
}
+static KnownBits computeKnownBits(const Value *V, unsigned Depth,
+ const Query &Q);
+
+KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL,
+ unsigned Depth, AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
+ return ::computeKnownBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT));
+}
+
bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
const DataLayout &DL,
AssumptionCache *AC, const Instruction *CxtI,
@@ -159,16 +168,6 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
return (LHSKnown.Zero | RHSKnown.Zero).isAllOnesValue();
}
-static void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth, const Query &Q);
-
-void llvm::ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- const DataLayout &DL, unsigned Depth,
- AssumptionCache *AC, const Instruction *CxtI,
- const DominatorTree *DT) {
- ::ComputeSignBit(V, KnownZero, KnownOne, Depth,
- Query(DL, AC, safeCxtI(V, CxtI), DT));
-}
static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
const Query &Q);
@@ -194,9 +193,8 @@ bool llvm::isKnownNonNegative(const Value *V, const DataLayout &DL,
unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- bool NonNegative, Negative;
- ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT);
- return NonNegative;
+ KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
+ return Known.isNonNegative();
}
bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth,
@@ -214,9 +212,8 @@ bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth,
bool llvm::isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- bool NonNegative, Negative;
- ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT);
- return Negative;
+ KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
+ return Known.isNegative();
}
static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q);
@@ -342,10 +339,10 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
// Also compute a conservative estimate for high known-0 bits.
// More trickiness is possible, but this is sufficient for the
// interesting case of alignment computation.
- unsigned TrailZ = Known.Zero.countTrailingOnes() +
- Known2.Zero.countTrailingOnes();
- unsigned LeadZ = std::max(Known.Zero.countLeadingOnes() +
- Known2.Zero.countLeadingOnes(),
+ unsigned TrailZ = Known.countMinTrailingZeros() +
+ Known2.countMinTrailingZeros();
+ unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
+ Known2.countMinLeadingZeros(),
BitWidth) - BitWidth;
TrailZ = std::min(TrailZ, BitWidth);
@@ -750,8 +747,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// Whatever high bits in c are zero are known to be zero.
- Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes());
- // assume(v <_u c)
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
+ // assume(v <_u c)
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_ULT &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
@@ -761,9 +758,9 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
// Whatever high bits in c are zero are known to be zero (if c is a power
// of 2, then one more).
if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I)))
- Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()+1);
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros() + 1);
else
- Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes());
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
}
}
@@ -916,7 +913,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
m_Value(Y))))) {
Known2.resetAll();
computeKnownBits(Y, Known2, Depth + 1, Q);
- if (Known2.One.countTrailingOnes() > 0)
+ if (Known2.countMinTrailingOnes() > 0)
Known.Zero.setBit(0);
}
break;
@@ -953,14 +950,13 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
- unsigned LeadZ = Known2.Zero.countLeadingOnes();
+ unsigned LeadZ = Known2.countMinLeadingZeros();
Known2.resetAll();
computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
- unsigned RHSUnknownLeadingOnes = Known2.One.countLeadingZeros();
- if (RHSUnknownLeadingOnes != BitWidth)
- LeadZ = std::min(BitWidth,
- LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+ unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
+ if (RHSMaxLeadingZeros != BitWidth)
+ LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
Known.Zero.setHighBits(LeadZ);
break;
@@ -983,8 +979,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
if (Known.isNegative() && Known2.isNegative())
// We can derive a lower bound on the result by taking the max of the
// leading one bits.
- MaxHighOnes = std::max(Known.One.countLeadingOnes(),
- Known2.One.countLeadingOnes());
+ MaxHighOnes =
+ std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
// If either side is non-negative, the result is non-negative.
else if (Known.isNonNegative() || Known2.isNonNegative())
MaxHighZeros = 1;
@@ -993,8 +989,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
if (Known.isNonNegative() && Known2.isNonNegative())
// We can derive an upper bound on the result by taking the max of the
// leading zero bits.
- MaxHighZeros = std::max(Known.Zero.countLeadingOnes(),
- Known2.Zero.countLeadingOnes());
+ MaxHighZeros = std::max(Known.countMinLeadingZeros(),
+ Known2.countMinLeadingZeros());
// If either side is negative, the result is negative.
else if (Known.isNegative() || Known2.isNegative())
MaxHighOnes = 1;
@@ -1002,12 +998,12 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
// We can derive a lower bound on the result by taking the max of the
// leading one bits.
MaxHighOnes =
- std::max(Known.One.countLeadingOnes(), Known2.One.countLeadingOnes());
+ std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
} else if (SPF == SPF_UMIN) {
// We can derive an upper bound on the result by taking the max of the
// leading zero bits.
MaxHighZeros =
- std::max(Known.Zero.countLeadingOnes(), Known2.Zero.countLeadingOnes());
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
}
// Only known if known in both the LHS and RHS.
@@ -1185,8 +1181,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
- unsigned Leaders = std::max(Known.Zero.countLeadingOnes(),
- Known2.Zero.countLeadingOnes());
+ unsigned Leaders =
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
Known.resetAll();
Known.Zero.setHighBits(Leaders);
break;
@@ -1207,7 +1203,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
// to determine if we can prove known low zero bits.
KnownBits LocalKnown(BitWidth);
computeKnownBits(I->getOperand(0), LocalKnown, Depth + 1, Q);
- unsigned TrailZ = LocalKnown.Zero.countTrailingOnes();
+ unsigned TrailZ = LocalKnown.countMinTrailingZeros();
gep_type_iterator GTI = gep_type_begin(I);
for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
@@ -1241,7 +1237,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
computeKnownBits(Index, LocalKnown, Depth + 1, Q);
TrailZ = std::min(TrailZ,
unsigned(countTrailingZeros(TypeSize) +
- LocalKnown.Zero.countTrailingOnes()));
+ LocalKnown.countMinTrailingZeros()));
}
}
@@ -1286,8 +1282,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
KnownBits Known3(Known);
computeKnownBits(L, Known3, Depth + 1, Q);
- Known.Zero.setLowBits(std::min(Known2.Zero.countTrailingOnes(),
- Known3.Zero.countTrailingOnes()));
+ Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(),
+ Known3.countMinTrailingZeros()));
if (DontImproveNonNegativePhiBits)
break;
@@ -1386,12 +1382,25 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
Known.Zero |= Known2.Zero.byteSwap();
Known.One |= Known2.One.byteSwap();
break;
- case Intrinsic::ctlz:
+ case Intrinsic::ctlz: {
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleLZ = Known2.One.countLeadingZeros();
+ // If this call is undefined for 0, the result will be less than 2^n.
+ if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
+ PossibleLZ = std::min(PossibleLZ, BitWidth - 1);
+ unsigned LowBits = Log2_32(PossibleLZ)+1;
+ Known.Zero.setBitsFrom(LowBits);
+ break;
+ }
case Intrinsic::cttz: {
- unsigned LowBits = Log2_32(BitWidth)+1;
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleTZ = Known2.One.countTrailingZeros();
// If this call is undefined for 0, the result will be less than 2^n.
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
- LowBits -= 1;
+ PossibleTZ = std::min(PossibleTZ, BitWidth - 1);
+ unsigned LowBits = Log2_32(PossibleTZ)+1;
Known.Zero.setBitsFrom(LowBits);
break;
}
@@ -1399,7 +1408,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// We can bound the space the count needs. Also, bits known to be zero
// can't contribute to the population.
- unsigned BitsPossiblySet = BitWidth - Known2.Zero.countPopulation();
+ unsigned BitsPossiblySet = Known2.countMaxPopulation();
unsigned LowBits = Log2_32(BitsPossiblySet)+1;
Known.Zero.setBitsFrom(LowBits);
// TODO: we could bound KnownOne using the lower bound on the number
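The three intrinsic cases above all bound how large the count result can be and then clear the high bits of the known-bits accordingly (the result needs only Log2(bound)+1 bits). A standalone C++20 check that a known-one bit really caps ctlz the way the new code assumes:

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t KnownOne = 0x10;                           // bit 4 known set
  const unsigned PossibleLZ = std::countl_zero(KnownOne);  // at most 3
  for (unsigned V = 0; V < 256; ++V)
    if ((V & KnownOne) == KnownOne)     // every value consistent with KnownOne
      assert(std::countl_zero((uint8_t)V) <= PossibleLZ);
}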
@@ -1450,6 +1459,14 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
}
/// Determine which bits of V are known to be either zero or one and return
+/// them.
+KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) {
+ KnownBits Known(getBitWidth(V->getType(), Q.DL));
+ computeKnownBits(V, Known, Depth, Q);
+ return Known;
+}
+
+/// Determine which bits of V are known to be either zero or one and return
/// them in the Known bit set.
///
/// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that
@@ -1568,16 +1585,6 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
}
-/// Determine whether the sign bit is known to be zero or one.
-/// Convenience wrapper around computeKnownBits.
-void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth, const Query &Q) {
- KnownBits Bits(getBitWidth(V->getType(), Q.DL));
- computeKnownBits(V, Bits, Depth, Q);
- KnownOne = Bits.isNegative();
- KnownZero = Bits.isNonNegative();
-}
-
/// Return true if the given value is known to have exactly one
/// bit set when defined. For vectors return true if every element is known to
/// be a power of two when defined. Supports values with integer or pointer
@@ -1842,24 +1849,20 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
if (BO->isExact())
return isKnownNonZero(X, Depth, Q);
- bool XKnownNonNegative, XKnownNegative;
- ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q);
- if (XKnownNegative)
+ KnownBits Known = computeKnownBits(X, Depth, Q);
+ if (Known.isNegative())
return true;
// If the shifter operand is a constant, and all of the bits shifted
// out are known to be zero, and X is known non-zero then at least one
// non-zero bit must remain.
if (ConstantInt *Shift = dyn_cast<ConstantInt>(Y)) {
- KnownBits Known(BitWidth);
- computeKnownBits(X, Known, Depth, Q);
-
auto ShiftVal = Shift->getLimitedValue(BitWidth - 1);
// Is there a known one in the portion not shifted out?
- if (Known.One.countLeadingZeros() < BitWidth - ShiftVal)
+ if (Known.countMaxLeadingZeros() < BitWidth - ShiftVal)
return true;
// Are all the bits to be shifted out known zero?
- if (Known.Zero.countTrailingOnes() >= ShiftVal)
+ if (Known.countMinTrailingZeros() >= ShiftVal)
return isKnownNonZero(X, Depth, Q);
}
}
@@ -1869,39 +1872,34 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
}
// X + Y.
else if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
- bool XKnownNonNegative, XKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Depth, Q);
+ KnownBits XKnown = computeKnownBits(X, Depth, Q);
+ KnownBits YKnown = computeKnownBits(Y, Depth, Q);
// If X and Y are both non-negative (as signed values) then their sum is not
// zero unless both X and Y are zero.
- if (XKnownNonNegative && YKnownNonNegative)
+ if (XKnown.isNonNegative() && YKnown.isNonNegative())
if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q))
return true;
// If X and Y are both negative (as signed values) then their sum is not
// zero unless both X and Y equal INT_MIN.
- if (XKnownNegative && YKnownNegative) {
- KnownBits Known(BitWidth);
+ if (XKnown.isNegative() && YKnown.isNegative()) {
APInt Mask = APInt::getSignedMaxValue(BitWidth);
// The sign bit of X is set. If some other bit is set then X is not equal
// to INT_MIN.
- computeKnownBits(X, Known, Depth, Q);
- if (Known.One.intersects(Mask))
+ if (XKnown.One.intersects(Mask))
return true;
// The sign bit of Y is set. If some other bit is set then Y is not equal
// to INT_MIN.
- computeKnownBits(Y, Known, Depth, Q);
- if (Known.One.intersects(Mask))
+ if (YKnown.One.intersects(Mask))
return true;
}
// The sum of a non-negative number and a power of two is not zero.
- if (XKnownNonNegative &&
+ if (XKnown.isNonNegative() &&
isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q))
return true;
- if (YKnownNonNegative &&
+ if (YKnown.isNonNegative() &&
isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q))
return true;
}
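The two-negatives case above leans on a small modular-arithmetic fact: the truncated sum of two negative values is zero only when both are INT_MIN, so any other known-one bit proves the sum non-zero. An exhaustive 8-bit confirmation:

#include <cassert>
#include <cstdint>

int main() {
  for (int X = -128; X < 0; ++X)
    for (int Y = -128; Y < 0; ++Y)
      if ((uint8_t)((uint8_t)X + (uint8_t)Y) == 0)  // sum wraps to zero
        assert(X == -128 && Y == -128);             // only at INT8_MIN twice
}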
@@ -2276,14 +2274,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
// If we know that the sign bit is either zero or one, determine the number of
// identical bits in the top of the input value.
- if (Known.isNonNegative())
- return std::max(FirstAnswer, Known.Zero.countLeadingOnes());
-
- if (Known.isNegative())
- return std::max(FirstAnswer, Known.One.countLeadingOnes());
-
- // computeKnownBits gave us no extra information about the top bits.
- return FirstAnswer;
+ return std::max(FirstAnswer, Known.countMinSignBits());
}
/// This function computes the integer multiple of Base that equals V.
@@ -3441,8 +3432,8 @@ OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS,
computeKnownBits(RHS, RHSKnown, DL, /*Depth=*/0, AC, CxtI, DT);
// Note that underestimating the number of zero bits gives a more
// conservative answer.
- unsigned ZeroBits = LHSKnown.Zero.countLeadingOnes() +
- RHSKnown.Zero.countLeadingOnes();
+ unsigned ZeroBits = LHSKnown.countMinLeadingZeros() +
+ RHSKnown.countMinLeadingZeros();
// First handle the easy case: if we have enough zero bits there's
// definitely no overflow.
if (ZeroBits >= BitWidth)
@@ -3475,21 +3466,17 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
- bool LHSKnownNonNegative, LHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
- if (LHSKnownNonNegative || LHSKnownNegative) {
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
-
- if (LHSKnownNegative && RHSKnownNegative) {
+ KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) {
+ KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
+
+ if (LHSKnown.isNegative() && RHSKnown.isNegative()) {
// The sign bit is set in both cases: this MUST overflow.
// Create a simple add instruction, and insert it into the struct.
return OverflowResult::AlwaysOverflows;
}
- if (LHSKnownNonNegative && RHSKnownNonNegative) {
+ if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) {
// The sign bit is clear in both cases: this CANNOT overflow.
// Create a simple add instruction, and insert it into the struct.
return OverflowResult::NeverOverflows;
@@ -3499,6 +3486,51 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
return OverflowResult::MayOverflow;
}
+/// \brief Return true if we can prove that adding the two values described
+/// by these KnownBits will not overflow; otherwise return false.
+static bool checkRippleForSignedAdd(const KnownBits &LHSKnown,
+ const KnownBits &RHSKnown) {
+ // Addition of two 2's complement numbers having opposite signs will never
+ // overflow.
+ if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
+ (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
+ return true;
+
+ // If either of the values is known to be non-negative, adding them can only
+ // overflow if the other is also non-negative, so we can assume that.
+ // Two non-negative numbers will only overflow if there is a carry into the
+ // sign bit, so we can check whether, even when the values are as big as
+ // possible, there is no carry into the sign bit.
+ if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
+ APInt MaxLHS = ~LHSKnown.Zero;
+ MaxLHS.clearSignBit();
+ APInt MaxRHS = ~RHSKnown.Zero;
+ MaxRHS.clearSignBit();
+ APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
+ return Result.isSignBitClear();
+ }
+
+ // If either of the values is known to be negative, adding them can only
+ // overflow if the other is also negative, so we can assume that.
+ // Two negative numbers will only overflow if there is no carry into the
+ // sign bit, so we can check whether, even when the values are as small as
+ // possible, there is still a carry into the sign bit.
+ if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
+ APInt MinLHS = LHSKnown.One;
+ MinLHS.clearSignBit();
+ APInt MinRHS = RHSKnown.One;
+ MinRHS.clearSignBit();
+ APInt Result = std::move(MinLHS) + std::move(MinRHS);
+ return Result.isSignBitSet();
+ }
+
+ // If we reached here it means that we know nothing about the sign bits.
+ // In this case we can't know if there will be an overflow, since by
+ // changing the sign bits any two values can be made to overflow.
+ return false;
+}
+
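A concrete 8-bit instance of checkRippleForSignedAdd's non-negative branch, with KnownBits reduced to plain masks (an illustrative stand-in, not the LLVM type): if even the largest values compatible with the known-zero bits add without carrying into the sign-bit position, the addition cannot overflow.

#include <cassert>
#include <cstdint>

int main() {
  // Both operands have the sign bit and bit 6 known zero => values <= 0x3F.
  const uint8_t LHSZero = 0xC0, RHSZero = 0xC0;
  uint8_t MaxLHS = (uint8_t)~LHSZero;        // 0x3F, sign bit already clear
  uint8_t MaxRHS = (uint8_t)~RHSZero;        // 0x3F
  uint8_t Sum = (uint8_t)(MaxLHS + MaxRHS);  // 0x7E
  assert((Sum & 0x80) == 0);  // no carry into the sign bit: NeverOverflows
}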
static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
const Value *RHS,
const AddOperator *Add,
@@ -3510,18 +3542,29 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
return OverflowResult::NeverOverflows;
}
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
+ // If LHS and RHS each have at least two sign bits, the addition will look
+ // like
+ //
+ // XX..... +
+ // YY.....
+ //
+ // If the carry into the most significant position is 0, X and Y can't both
+ // be 1 and therefore the carry out of the addition is also 0.
+ //
+ // If the carry into the most significant position is 1, X and Y can't both
+ // be 0 and therefore the carry out of the addition is also 1.
+ //
+ // Since the carry into the most significant position is always equal to
+ // the carry out of the addition, there is no signed overflow.
+ if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 &&
+ ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1)
+ return OverflowResult::NeverOverflows;
+
+ KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
- if ((LHSKnownNonNegative && RHSKnownNegative) ||
- (LHSKnownNegative && RHSKnownNonNegative)) {
- // The sign bits are opposite: this CANNOT overflow.
+ if (checkRippleForSignedAdd(LHSKnown, RHSKnown))
return OverflowResult::NeverOverflows;
- }
// The remaining code needs Add to be available. Early returns if not so.
if (!Add)
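The new early exit rests on the carry argument spelled out in the comment above: operands with at least two sign bits fit in one bit less than the type, so their sum fits in the type. Exhaustive 8-bit verification (two sign bits at 8 bits means the range [-64, 63]):

#include <cassert>

int main() {
  for (int X = -64; X <= 63; ++X)
    for (int Y = -64; Y <= 63; ++Y) {
      int Sum = X + Y;                  // computed without wrapping
      assert(Sum >= -128 && Sum <= 127);  // still representable in 8 bits
    }
}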
@@ -3532,14 +3575,13 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
// @llvm.assume'ed non-negative rather than proved so from analyzing its
// operands.
bool LHSOrRHSKnownNonNegative =
- (LHSKnownNonNegative || RHSKnownNonNegative);
- bool LHSOrRHSKnownNegative = (LHSKnownNegative || RHSKnownNegative);
+ (LHSKnown.isNonNegative() || RHSKnown.isNonNegative());
+ bool LHSOrRHSKnownNegative =
+ (LHSKnown.isNegative() || RHSKnown.isNegative());
if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) {
- bool AddKnownNonNegative, AddKnownNegative;
- ComputeSignBit(Add, AddKnownNonNegative, AddKnownNegative, DL,
- /*Depth=*/0, AC, CxtI, DT);
- if ((AddKnownNonNegative && LHSOrRHSKnownNonNegative) ||
- (AddKnownNegative && LHSOrRHSKnownNegative)) {
+ KnownBits AddKnown = computeKnownBits(Add, DL, /*Depth=*/0, AC, CxtI, DT);
+ if ((AddKnown.isNonNegative() && LHSOrRHSKnownNonNegative) ||
+ (AddKnown.isNegative() && LHSOrRHSKnownNegative)) {
return OverflowResult::NeverOverflows;
}
}
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 722f17a8067e..2d2249da4e13 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
using namespace llvm;
using namespace llvm::PatternMatch;
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 97a567565b47..d7602c83435c 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -162,6 +162,10 @@ bool LLParser::ValidateEndOfModule() {
AS = AS.addAttributes(Context, AttributeList::FunctionIndex,
AttributeSet::get(Context, FnAttrs));
II->setAttributes(AS);
+ } else if (auto *GV = dyn_cast<GlobalVariable>(V)) {
+ AttrBuilder Attrs(GV->getAttributes());
+ Attrs.merge(B);
+ GV->setAttributes(AttributeSet::get(Context, Attrs));
} else {
llvm_unreachable("invalid object with forward attribute group reference");
}
@@ -832,10 +836,10 @@ bool LLParser::parseIndirectSymbol(
/// ParseGlobal
/// ::= GlobalVar '=' OptionalLinkage OptionalVisibility OptionalDLLStorageClass
/// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace
-/// OptionalExternallyInitialized GlobalType Type Const
+/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs
/// ::= OptionalLinkage OptionalVisibility OptionalDLLStorageClass
/// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace
-/// OptionalExternallyInitialized GlobalType Type Const
+/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs
///
/// Everything up to and including OptionalUnnamedAddr has been parsed
/// already.
@@ -950,6 +954,16 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
}
}
+ AttrBuilder Attrs;
+ LocTy BuiltinLoc;
+ std::vector<unsigned> FwdRefAttrGrps;
+ if (ParseFnAttributeValuePairs(Attrs, FwdRefAttrGrps, false, BuiltinLoc))
+ return true;
+ if (Attrs.hasAttributes() || !FwdRefAttrGrps.empty()) {
+ GV->setAttributes(AttributeSet::get(Context, Attrs));
+ ForwardRefAttrGroups[GV] = FwdRefAttrGrps;
+ }
+
return false;
}
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 580261a3b5e0..76298121566a 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -93,13 +93,6 @@ static cl::opt<bool> PrintSummaryGUIDs(
cl::desc(
"Print the global id for each value when reading the module summary"));
-// FIXME: This flag should either be removed or moved to clang as a driver flag.
-static llvm::cl::opt<bool> IgnoreEmptyThinLTOIndexFile(
- "ignore-empty-index-file", llvm::cl::ZeroOrMore,
- llvm::cl::desc(
- "Ignore an empty index file and perform non-ThinLTO compilation"),
- llvm::cl::init(false));
-
namespace {
enum {
@@ -2750,7 +2743,7 @@ Error BitcodeReader::parseComdatRecord(ArrayRef<uint64_t> Record) {
Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
// v1: [pointer type, isconst, initid, linkage, alignment, section,
// visibility, threadlocal, unnamed_addr, externally_initialized,
- // dllstorageclass, comdat] (name in VST)
+ // dllstorageclass, comdat, attributes] (name in VST)
// v2: [strtab_offset, strtab_size, v1]
StringRef Name;
std::tie(Name, Record) = readNameFromStrtab(Record);
@@ -2830,6 +2823,11 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
} else if (hasImplicitComdat(RawLinkage)) {
NewGV->setComdat(reinterpret_cast<Comdat *>(1));
}
+
+ if (Record.size() > 12) {
+ auto AS = getAttributes(Record[12]).getFnAttributes();
+ NewGV->setAttributes(AS);
+ }
return Error::success();
}
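The reader guards the new trailing field with a length check so bitcode written before this change still loads. The versioning idiom in isolation (attributesFieldOr is a hypothetical helper; field index 12 matches the hunk above):

#include <cassert>
#include <cstdint>
#include <vector>

uint64_t attributesFieldOr(const std::vector<uint64_t> &Record,
                           uint64_t Default) {
  return Record.size() > 12 ? Record[12] : Default;  // old records: default
}

int main() {
  std::vector<uint64_t> OldRecord(12, 0);  // written before the new field
  std::vector<uint64_t> NewRecord(13, 0);
  NewRecord[12] = 7;                       // attribute-list ID
  assert(attributesFieldOr(OldRecord, 0) == 0);
  assert(attributesFieldOr(NewRecord, 0) == 7);
}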
@@ -5658,7 +5656,8 @@ Expected<bool> llvm::hasGlobalValueSummary(MemoryBufferRef Buffer) {
}
Expected<std::unique_ptr<ModuleSummaryIndex>>
-llvm::getModuleSummaryIndexForFile(StringRef Path) {
+llvm::getModuleSummaryIndexForFile(StringRef Path,
+ bool IgnoreEmptyThinLTOIndexFile) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFileOrSTDIN(Path);
if (!FileOrErr)
diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp
index 42135e5949ce..d80e1da911ca 100644
--- a/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -500,7 +500,7 @@ class MetadataLoader::MetadataLoaderImpl {
// Upgrade variables attached to globals.
for (auto &GV : TheModule.globals()) {
- SmallVector<MDNode *, 1> MDs, NewMDs;
+ SmallVector<MDNode *, 1> MDs;
GV.getMetadata(LLVMContext::MD_dbg, MDs);
GV.eraseMetadata(LLVMContext::MD_dbg);
for (auto *MD : MDs)
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 1b8d81a60201..1f8b50342c2d 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1109,7 +1109,7 @@ void ModuleBitcodeWriter::writeModuleInfo() {
// GLOBALVAR: [strtab offset, strtab size, type, isconst, initid,
// linkage, alignment, section, visibility, threadlocal,
// unnamed_addr, externally_initialized, dllstorageclass,
- // comdat]
+ // comdat, attributes]
Vals.push_back(StrtabBuilder.add(GV.getName()));
Vals.push_back(GV.getName().size());
Vals.push_back(VE.getTypeID(GV.getValueType()));
@@ -1124,13 +1124,17 @@ void ModuleBitcodeWriter::writeModuleInfo() {
GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None ||
GV.isExternallyInitialized() ||
GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass ||
- GV.hasComdat()) {
+ GV.hasComdat() ||
+ GV.hasAttributes()) {
Vals.push_back(getEncodedVisibility(GV));
Vals.push_back(getEncodedThreadLocalMode(GV));
Vals.push_back(getEncodedUnnamedAddr(GV));
Vals.push_back(GV.isExternallyInitialized());
Vals.push_back(getEncodedDLLStorageClass(GV));
Vals.push_back(GV.hasComdat() ? VE.getComdatID(GV.getComdat()) : 0);
+
+ auto AL = GV.getAttributesAsList(AttributeList::FunctionIndex);
+ Vals.push_back(VE.getAttributeListID(AL));
} else {
AbbrevToUse = SimpleGVarAbbrev;
}
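On the writer side, the compact abbreviation is kept whenever every extended field is at its default, so adding attributes costs nothing for typical globals. A rough sketch of that long-form/short-form decision, under simplified layout assumptions (the field counts and names here are illustrative, not the real GLOBALVAR encoding):

    #include <cstdint>
    #include <vector>

    std::vector<uint64_t> encodeGlobalVar(bool HasComdat, bool HasAttrs,
                                          uint64_t ComdatID, uint64_t AttrID) {
      std::vector<uint64_t> Vals(7, 0); // mandatory prefix of the record
      if (HasComdat || HasAttrs) {      // long form: append the optional tail
        Vals.insert(Vals.end(), 5, 0);  // visibility..dllstorageclass defaults
        Vals.push_back(HasComdat ? ComdatID : 0);
        Vals.push_back(HasAttrs ? AttrID : 0);
      }
      return Vals; // short records round-trip through the reader guard above
    }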
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 861150766986..fd76400331d9 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -314,10 +314,13 @@ ValueEnumerator::ValueEnumerator(const Module &M,
// Remember what is the cutoff between globalvalue's and other constants.
unsigned FirstConstant = Values.size();
- // Enumerate the global variable initializers.
- for (const GlobalVariable &GV : M.globals())
+ // Enumerate the global variable initializers and attributes.
+ for (const GlobalVariable &GV : M.globals()) {
if (GV.hasInitializer())
EnumerateValue(GV.getInitializer());
+ if (GV.hasAttributes())
+ EnumerateAttributes(GV.getAttributesAsList(AttributeList::FunctionIndex));
+ }
// Enumerate the aliasees.
for (const GlobalAlias &GA : M.aliases())
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 76549540ce0f..73fc2b35fe4e 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -21,5 +21,5 @@ add_subdirectory(LineEditor)
add_subdirectory(ProfileData)
add_subdirectory(Fuzzer)
add_subdirectory(Passes)
-add_subdirectory(LibDriver)
+add_subdirectory(ToolDrivers)
add_subdirectory(XRay)
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 87b45c001de4..98163bffb60b 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -767,7 +767,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
// If our DISubprogram name is empty, use the mangled name.
if (FuncName.empty())
- FuncName = GlobalValue::getRealLinkageName(GV->getName());
+ FuncName = GlobalValue::dropLLVMManglingEscape(GV->getName());
// Emit a symbol subsection, required by VS2012+ to find function boundaries.
OS.AddComment("Symbol subsection for " + Twine(FuncName));
@@ -888,13 +888,21 @@ void CodeViewDebug::collectVariableInfoFromMFTable(
if (!Scope)
continue;
+ // If the variable has an attached offset expression, extract it.
+ // FIXME: Try to handle DW_OP_deref as well.
+ int64_t ExprOffset = 0;
+ if (VI.Expr)
+ if (!VI.Expr->extractIfOffset(ExprOffset))
+ continue;
+
// Get the frame register used and the offset.
unsigned FrameReg = 0;
int FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg);
uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg);
// Calculate the label ranges.
- LocalVarDefRange DefRange = createDefRangeMem(CVReg, FrameOffset);
+ LocalVarDefRange DefRange =
+ createDefRangeMem(CVReg, FrameOffset + ExprOffset);
for (const InsnRange &Range : Scope->getRanges()) {
const MCSymbol *Begin = getLabelBeforeInsn(Range.first);
const MCSymbol *End = getLabelAfterInsn(Range.second);
@@ -2194,7 +2202,7 @@ void CodeViewDebug::emitDebugInfoForGlobals() {
if (GV->hasComdat()) {
MCSymbol *GVSym = Asm->getSymbol(GV);
OS.AddComment("Symbol subsection for " +
- Twine(GlobalValue::getRealLinkageName(GV->getName())));
+ Twine(GlobalValue::dropLLVMManglingEscape(GV->getName())));
switchToDebugSectionForSymbol(GVSym);
EndLabel = beginCVSubsection(ModuleDebugFragmentKind::Symbols);
// FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions.
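The new check admits a variable from the MF side table only when its attached expression is a plain constant offset, which is then folded into the frame offset. A sketch of an offset-only test under a simplified encoding — real DIExpressions are richer; the opcode values mirror DWARF's DW_OP_plus_uconst (0x23) and DW_OP_deref (0x06):

    #include <cstdint>
    #include <optional>
    #include <vector>

    constexpr uint64_t OpPlusUconst = 0x23, OpDeref = 0x06;

    std::optional<int64_t> extractIfOffset(const std::vector<uint64_t> &Expr) {
      if (Expr.empty())
        return 0; // no expression means "at the location itself"
      if (Expr.size() == 2 && Expr[0] == OpPlusUconst)
        return static_cast<int64_t>(Expr[1]);
      return std::nullopt; // e.g. OpDeref: caller skips the variable
    }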
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 1d63e33a4d33..826162ad47c4 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -129,10 +129,9 @@ bool hasDebugInfo(const MachineModuleInfo *MMI, const MachineFunction *MF) {
}
void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
- assert(Asm);
PrevInstBB = nullptr;
- if (!hasDebugInfo(MMI, MF)) {
+ if (!Asm || !hasDebugInfo(MMI, MF)) {
skippedNonDebugFunction();
return;
}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 738e062cb93f..e172712cf889 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -440,7 +440,7 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) {
auto *InlinedSP = getDISubprogram(DS);
// Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
// was inlined from another compile unit.
- DIE *OriginDIE = DU->getAbstractSPDies()[InlinedSP];
+ DIE *OriginDIE = getAbstractSPDies()[InlinedSP];
assert(OriginDIE && "Unable to find original DIE for an inlined subprogram.");
auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_inlined_subroutine);
@@ -634,7 +634,7 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
LexicalScope *Scope) {
- DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()];
+ DIE *&AbsDef = getAbstractSPDies()[Scope->getScopeNode()];
if (AbsDef)
return;
@@ -696,7 +696,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE(
void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
DIE *D = getDIE(SP);
- if (DIE *AbsSPDIE = DU->getAbstractSPDies().lookup(SP)) {
+ if (DIE *AbsSPDIE = getAbstractSPDies().lookup(SP)) {
if (D)
// If this subprogram has an abstract definition, reference that
addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE);
@@ -708,6 +708,42 @@ void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
}
}
+void DwarfCompileUnit::finishVariableDefinition(const DbgVariable &Var) {
+ DbgVariable *AbsVar = getExistingAbstractVariable(
+ InlinedVariable(Var.getVariable(), Var.getInlinedAt()));
+ auto *VariableDie = Var.getDIE();
+ if (AbsVar && AbsVar->getDIE()) {
+ addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
+ *AbsVar->getDIE());
+ } else
+ applyVariableAttributes(Var, *VariableDie);
+}
+
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(InlinedVariable IV) {
+ const DILocalVariable *Cleansed;
+ return getExistingAbstractVariable(IV, Cleansed);
+}
+
+// Find abstract variable, if any, associated with Var.
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(
+ InlinedVariable IV, const DILocalVariable *&Cleansed) {
+ // More than one inlined variable corresponds to one abstract variable.
+ Cleansed = IV.first;
+ auto &AbstractVariables = getAbstractVariables();
+ auto I = AbstractVariables.find(Cleansed);
+ if (I != AbstractVariables.end())
+ return I->second.get();
+ return nullptr;
+}
+
+void DwarfCompileUnit::createAbstractVariable(const DILocalVariable *Var,
+ LexicalScope *Scope) {
+ assert(Scope && Scope->isAbstractScope());
+ auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
+ DU->addScopeVariable(Scope, AbsDbgVariable.get());
+ getAbstractVariables()[Var] = std::move(AbsDbgVariable);
+}
+
void DwarfCompileUnit::emitHeader(bool UseOffsets) {
// Don't bother labeling the .dwo unit, as its offset isn't used.
if (!Skeleton) {
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 20a415150b4d..77e9e671529f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -68,6 +68,9 @@ class DwarfCompileUnit final : public DwarfUnit {
// ranges/locs.
const MCSymbol *BaseAddress;
+ DenseMap<const MDNode *, DIE *> AbstractSPDies;
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
+
/// \brief Construct a DIE for the given DbgVariable without initializing the
/// DbgVariable's DIE reference.
DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract);
@@ -76,6 +79,18 @@ class DwarfCompileUnit final : public DwarfUnit {
bool includeMinimalInlineScopes() const;
+ DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return AbstractSPDies;
+ return DU->getAbstractSPDies();
+ }
+
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &getAbstractVariables() {
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return AbstractVariables;
+ return DU->getAbstractVariables();
+ }
+
public:
DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A,
DwarfDebug *DW, DwarfFile *DWU);
@@ -189,6 +204,13 @@ public:
DIE *constructImportedEntityDIE(const DIImportedEntity *Module);
void finishSubprogramDefinition(const DISubprogram *SP);
+ void finishVariableDefinition(const DbgVariable &Var);
+ /// Find abstract variable associated with Var.
+ typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
+ DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
+ const DILocalVariable *&Cleansed);
+ DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
+ void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope);
/// Set the skeleton unit associated with this unit.
void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; }
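The two accessors above implement one idiom: a .dwo unit that may not reference DIEs across units keeps private tables, while everything else shares the DwarfFile-owned ones. A minimal sketch of that selection, with simplified stand-in types:

    #include <map>

    struct Entry; // opaque stand-in for DIE / DbgVariable
    using DieMap = std::map<const void *, Entry *>;

    struct FileState { DieMap SharedSPDies; };

    struct Unit {
      FileState &File;
      DieMap LocalSPDies;
      bool IsDwo = false, ShareAcrossDWOCUs = false;

      explicit Unit(FileState &F) : File(F) {}
      DieMap &abstractSPDies() {
        return (IsDwo && !ShareAcrossDWOCUs) ? LocalSPDies : File.SharedSPDies;
      }
    };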
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 6f442f5c3172..3410b98d7776 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -71,6 +71,10 @@ static cl::opt<bool> GenerateARangeSection("generate-arange-section",
cl::desc("Generate dwarf aranges"),
cl::init(false));
+static cl::opt<bool> SplitDwarfCrossCuReferences(
+ "split-dwarf-cross-cu-references", cl::Hidden,
+ cl::desc("Enable cross-cu references in DWO files"), cl::init(false));
+
namespace {
enum DefaultOnOff { Default, Enable, Disable };
}
@@ -362,21 +366,29 @@ template <typename Func> static void forBothCUs(DwarfCompileUnit &CU, Func F) {
F(*SkelCU);
}
-void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) {
+bool DwarfDebug::shareAcrossDWOCUs() const {
+ return SplitDwarfCrossCuReferences;
+}
+
+void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
+ LexicalScope *Scope) {
assert(Scope && Scope->getScopeNode());
assert(Scope->isAbstractScope());
assert(!Scope->getInlinedAt());
auto *SP = cast<DISubprogram>(Scope->getScopeNode());
- ProcessedSPNodes.insert(SP);
-
// Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
// was inlined from another compile unit.
auto &CU = *CUMap.lookup(SP->getUnit());
- forBothCUs(CU, [&](DwarfCompileUnit &CU) {
+ if (auto *SkelCU = CU.getSkeleton()) {
+ (shareAcrossDWOCUs() ? CU : SrcCU)
+ .constructAbstractSubprogramScopeDIE(Scope);
+ if (CU.getCUNode()->getSplitDebugInlining())
+ SkelCU->constructAbstractSubprogramScopeDIE(Scope);
+ } else {
CU.constructAbstractSubprogramScopeDIE(Scope);
- });
+ }
}
void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const {
@@ -564,13 +576,7 @@ void DwarfDebug::finishVariableDefinitions() {
// DIE::getUnit isn't simple - it walks parent pointers, etc.
DwarfCompileUnit *Unit = CUDieMap.lookup(VariableDie->getUnitDie());
assert(Unit);
- DbgVariable *AbsVar = getExistingAbstractVariable(
- InlinedVariable(Var->getVariable(), Var->getInlinedAt()));
- if (AbsVar && AbsVar->getDIE()) {
- Unit->addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
- *AbsVar->getDIE());
- } else
- Unit->applyVariableAttributes(*Var, *VariableDie);
+ Unit->finishVariableDefinition(*Var);
}
}
@@ -718,58 +724,32 @@ void DwarfDebug::endModule() {
}
// clean up.
- AbstractVariables.clear();
+ // FIXME: AbstractVariables.clear();
}
-// Find abstract variable, if any, associated with Var.
-DbgVariable *
-DwarfDebug::getExistingAbstractVariable(InlinedVariable IV,
- const DILocalVariable *&Cleansed) {
- // More then one inlined variable corresponds to one abstract variable.
- Cleansed = IV.first;
- auto I = AbstractVariables.find(Cleansed);
- if (I != AbstractVariables.end())
- return I->second.get();
- return nullptr;
-}
-
-DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) {
- const DILocalVariable *Cleansed;
- return getExistingAbstractVariable(IV, Cleansed);
-}
-
-void DwarfDebug::createAbstractVariable(const DILocalVariable *Var,
- LexicalScope *Scope) {
- assert(Scope && Scope->isAbstractScope());
- auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
- InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get());
- AbstractVariables[Var] = std::move(AbsDbgVariable);
-}
-
-void DwarfDebug::ensureAbstractVariableIsCreated(InlinedVariable IV,
+void DwarfDebug::ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable IV,
const MDNode *ScopeNode) {
const DILocalVariable *Cleansed = nullptr;
- if (getExistingAbstractVariable(IV, Cleansed))
+ if (CU.getExistingAbstractVariable(IV, Cleansed))
return;
- createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope(
+ CU.createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope(
cast<DILocalScope>(ScopeNode)));
}
-void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(
+void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU,
InlinedVariable IV, const MDNode *ScopeNode) {
const DILocalVariable *Cleansed = nullptr;
- if (getExistingAbstractVariable(IV, Cleansed))
+ if (CU.getExistingAbstractVariable(IV, Cleansed))
return;
if (LexicalScope *Scope =
LScopes.findAbstractScope(cast_or_null<DILocalScope>(ScopeNode)))
- createAbstractVariable(Cleansed, Scope);
+ CU.createAbstractVariable(Cleansed, Scope);
}
-
// Collect variable information from side table maintained by MF.
void DwarfDebug::collectVariableInfoFromMFTable(
- DenseSet<InlinedVariable> &Processed) {
+ DwarfCompileUnit &TheCU, DenseSet<InlinedVariable> &Processed) {
for (const auto &VI : Asm->MF->getVariableDbgInfo()) {
if (!VI.Var)
continue;
@@ -784,7 +764,7 @@ void DwarfDebug::collectVariableInfoFromMFTable(
if (!Scope)
continue;
- ensureAbstractVariableIsCreatedIfScoped(Var, Scope->getScopeNode());
+ ensureAbstractVariableIsCreatedIfScoped(TheCU, Var, Scope->getScopeNode());
auto RegVar = make_unique<DbgVariable>(Var.first, Var.second);
RegVar->initializeMMI(VI.Expr, VI.Slot);
if (InfoHolder.addScopeVariable(Scope, RegVar.get()))
@@ -955,9 +935,10 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
}
}
-DbgVariable *DwarfDebug::createConcreteVariable(LexicalScope &Scope,
+DbgVariable *DwarfDebug::createConcreteVariable(DwarfCompileUnit &TheCU,
+ LexicalScope &Scope,
InlinedVariable IV) {
- ensureAbstractVariableIsCreatedIfScoped(IV, Scope.getScopeNode());
+ ensureAbstractVariableIsCreatedIfScoped(TheCU, IV, Scope.getScopeNode());
ConcreteVariables.push_back(make_unique<DbgVariable>(IV.first, IV.second));
InfoHolder.addScopeVariable(&Scope, ConcreteVariables.back().get());
return ConcreteVariables.back().get();
@@ -980,7 +961,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
const DISubprogram *SP,
DenseSet<InlinedVariable> &Processed) {
// Grab the variable info that was squirreled away in the MMI side-table.
- collectVariableInfoFromMFTable(Processed);
+ collectVariableInfoFromMFTable(TheCU, Processed);
for (const auto &I : DbgValues) {
InlinedVariable IV = I.first;
@@ -1002,7 +983,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
continue;
Processed.insert(IV);
- DbgVariable *RegVar = createConcreteVariable(*Scope, IV);
+ DbgVariable *RegVar = createConcreteVariable(TheCU, *Scope, IV);
const MachineInstr *MInsn = Ranges.front().first;
assert(MInsn->isDebugValue() && "History must begin with debug value");
@@ -1038,7 +1019,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
for (const DILocalVariable *DV : SP->getVariables()) {
if (Processed.insert(InlinedVariable(DV, nullptr)).second)
if (LexicalScope *Scope = LScopes.findLexicalScope(DV->getScope()))
- createConcreteVariable(*Scope, InlinedVariable(DV, nullptr));
+ createConcreteVariable(TheCU, *Scope, InlinedVariable(DV, nullptr));
}
}
@@ -1229,12 +1210,12 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
for (const DILocalVariable *DV : SP->getVariables()) {
if (!ProcessedVars.insert(InlinedVariable(DV, nullptr)).second)
continue;
- ensureAbstractVariableIsCreated(InlinedVariable(DV, nullptr),
+ ensureAbstractVariableIsCreated(TheCU, InlinedVariable(DV, nullptr),
DV->getScope());
assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes
&& "ensureAbstractVariableIsCreated inserted abstract scopes");
}
- constructAbstractSubprogramScopeDIE(AScope);
+ constructAbstractSubprogramScopeDIE(TheCU, AScope);
}
ProcessedSPNodes.insert(SP);
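The rewritten routine picks which unit owns the abstract subprogram DIE: without split DWARF there is only one candidate, and with it the owning CU is used only when cross-CU references into DWO files are allowed. A compact model of that decision (types reduced to the bare minimum; the additional skeleton-side emission is elided):

    struct CU { bool HasSkeleton = false; };

    CU &pickUnit(CU &OwningCU, CU &SrcCU, bool ShareAcrossDWOCUs) {
      if (!OwningCU.HasSkeleton)
        return OwningCU; // no split DWARF: only one choice
      // Split DWARF: refer into the owning CU only if such cross-CU
      // references are permitted; otherwise build the DIE where we are.
      return ShareAcrossDWOCUs ? OwningCU : SrcCU;
    }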
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 8a96e7867b6e..b9c5aa9ffb23 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -210,7 +210,6 @@ class DwarfDebug : public DebugHandlerBase {
DenseMap<const MCSymbol *, uint64_t> SymSize;
/// Collection of abstract variables.
- DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables;
/// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists
@@ -313,20 +312,16 @@ class DwarfDebug : public DebugHandlerBase {
typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
- /// Find abstract variable associated with Var.
- DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
- const DILocalVariable *&Cleansed);
- DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
- void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope);
- void ensureAbstractVariableIsCreated(InlinedVariable Var,
+ void ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable Var,
const MDNode *Scope);
- void ensureAbstractVariableIsCreatedIfScoped(InlinedVariable Var,
+ void ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable Var,
const MDNode *Scope);
- DbgVariable *createConcreteVariable(LexicalScope &Scope, InlinedVariable IV);
+ DbgVariable *createConcreteVariable(DwarfCompileUnit &TheCU,
+ LexicalScope &Scope, InlinedVariable IV);
/// Construct a DIE for this abstract scope.
- void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
+ void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope);
void finishVariableDefinitions();
@@ -446,7 +441,8 @@ class DwarfDebug : public DebugHandlerBase {
const DbgValueHistoryMap::InstrRanges &Ranges);
/// Collect variable information from the side table maintained by MF.
- void collectVariableInfoFromMFTable(DenseSet<InlinedVariable> &P);
+ void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU,
+ DenseSet<InlinedVariable> &P);
protected:
/// Gather pre-function debug information.
@@ -518,6 +514,8 @@ public:
/// split dwarf proposal support.
bool useSplitDwarf() const { return HasSplitDwarf; }
+ bool shareAcrossDWOCUs() const;
+
/// Returns the Dwarf Version.
uint16_t getDwarfVersion() const;
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index d4d2ed277274..54924e9806ed 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -53,6 +53,7 @@ class DwarfFile {
// Collection of abstract subprogram DIEs.
DenseMap<const MDNode *, DIE *> AbstractSPDies;
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
/// Maps MDNodes for type system with the corresponding DIEs. These DIEs can
/// be shared across CUs, that is why we keep the map here instead
@@ -105,6 +106,9 @@ public:
DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
return AbstractSPDies;
}
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &getAbstractVariables() {
+ return AbstractVariables;
+ }
void insertDIE(const MDNode *TypeMD, DIE *Die) {
DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 8d25def7772c..667afbb450bd 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -173,7 +173,7 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
}
/// Check whether the DIE for this MDNode can be shared across CUs.
-static bool isShareableAcrossCUs(const DINode *D) {
+bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const {
// When the MDNode can be part of the type system, the DIE can be shared
// across CUs.
// Combining type units and cross-CU DIE sharing is lower value (since
@@ -181,6 +181,8 @@ static bool isShareableAcrossCUs(const DINode *D) {
// level already) but may be implementable for some value in projects
// building multiple independent libraries with LTO and then linking those
// together.
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return false;
return (isa<DIType>(D) ||
(isa<DISubprogram>(D) && !cast<DISubprogram>(D)->isDefinition())) &&
!GenerateDwarfTypeUnits;
@@ -645,7 +647,7 @@ void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) {
addString(Die,
DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name
: dwarf::DW_AT_MIPS_linkage_name,
- GlobalValue::getRealLinkageName(LinkageName));
+ GlobalValue::dropLLVMManglingEscape(LinkageName));
}
void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) {
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 8fc841703e23..7acad2cbd89f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -65,7 +65,7 @@ public:
//===----------------------------------------------------------------------===//
/// This dwarf writer support class manages information associated with a
/// source file.
- class DwarfUnit : public DIEUnit {
+class DwarfUnit : public DIEUnit {
protected:
/// MDNode for the compile unit.
const DICompileUnit *CUNode;
@@ -103,6 +103,9 @@ protected:
bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie);
+ bool shareAcrossDWOCUs() const;
+ bool isShareableAcrossCUs(const DINode *D) const;
+
public:
// Accessors.
AsmPrinter* getAsmPrinter() const { return Asm; }
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 704f0ac2f191..815658bfb637 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -101,7 +101,7 @@ void WinException::beginFunction(const MachineFunction *MF) {
// functions may still refer to it.
const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
StringRef FLinkageName =
- GlobalValue::getRealLinkageName(MF->getFunction()->getName());
+ GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName());
emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName);
}
shouldEmitLSDA = hasEHFunclets;
@@ -174,7 +174,7 @@ static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm,
// their funclet entry block's number.
const MachineFunction *MF = MBB->getParent();
const Function *F = MF->getFunction();
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
MCContext &Ctx = MF->getContext();
StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch";
return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" +
@@ -252,7 +252,7 @@ void WinException::endFunclet() {
!CurrentFuncletEntry->isCleanupFuncletEntry()) {
// If this is a C++ catch funclet (or the parent function),
// emit a reference to the LSDA for the parent function.
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol(
Twine("$cppxdata$", FuncLinkageName));
Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4);
@@ -536,7 +536,7 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) {
// Emit a label assignment with the SEH frame offset so we can use it for
// llvm.x86.seh.recoverfp.
StringRef FLinkageName =
- GlobalValue::getRealLinkageName(MF->getFunction()->getName());
+ GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName());
MCSymbol *ParentFrameOffset =
Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
const MCExpr *MCOffset =
@@ -635,7 +635,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
auto &OS = *Asm->OutStreamer;
const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
SmallVector<std::pair<const MCExpr *, int>, 4> IPToStateTable;
MCSymbol *FuncInfoXData = nullptr;
@@ -942,7 +942,7 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo,
void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
MCStreamer &OS = *Asm->OutStreamer;
const Function *F = MF->getFunction();
- StringRef FLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
bool VerboseAsm = OS.isVerboseAsm();
auto AddComment = [&](const Twine &Comment) {
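Every change in this file is the same mechanical rename; the new name says what the helper actually does. Its semantics can be modeled in a few lines: a leading '\01' byte tells the mangler to emit the name verbatim, and the helper strips that escape (a sketch; the real function operates on StringRef):

    #include <string>

    std::string dropLLVMManglingEscape(const std::string &Name) {
      // '\x01' marks "do not mangle"; drop it to get the raw linkage name.
      if (!Name.empty() && Name[0] == '\x01')
        return Name.substr(1);
      return Name;
    }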
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index 9c19a4fd3c3e..17e6be05eb42 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -47,8 +47,7 @@ namespace {
bool runOnFunction(Function &F) override;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
- bool IsStore, bool IsLoad);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -224,22 +223,16 @@ bool AtomicExpand::runOnFunction(Function &F) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
- bool IsStore, IsLoad;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = false;
- IsLoad = true;
} else if (SI && isReleaseOrStronger(SI->getOrdering())) {
FenceOrdering = SI->getOrdering();
SI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = true;
- IsLoad = false;
} else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
isAcquireOrStronger(RMWI->getOrdering()))) {
FenceOrdering = RMWI->getOrdering();
RMWI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = IsLoad = true;
} else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
(isReleaseOrStronger(CASI->getSuccessOrdering()) ||
isAcquireOrStronger(CASI->getSuccessOrdering()))) {
@@ -250,11 +243,10 @@ bool AtomicExpand::runOnFunction(Function &F) {
FenceOrdering = CASI->getSuccessOrdering();
CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
CASI->setFailureOrdering(AtomicOrdering::Monotonic);
- IsStore = IsLoad = true;
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering);
}
}
@@ -320,13 +312,12 @@ bool AtomicExpand::runOnFunction(Function &F) {
return MadeChange;
}
-bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
- bool IsStore, bool IsLoad) {
+bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) {
IRBuilder<> Builder(I);
- auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
- auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
// The trailing fence is emitted before the instruction instead of after
// because there is no easy way of setting Builder insertion point after
// an instruction. So we must erase it from the BB, and insert it back
@@ -1048,8 +1039,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
- TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitLeadingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(StartBB);
// Start the main loop block now that we've taken care of the preliminaries.
@@ -1064,8 +1054,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(ReleasingStoreBB);
if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
- TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitLeadingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(TryStoreBB);
Builder.SetInsertPoint(TryStoreBB);
@@ -1094,8 +1083,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// necessary.
Builder.SetInsertPoint(SuccessBB);
if (ShouldInsertFencesForAtomic)
- TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitTrailingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(ExitBB);
Builder.SetInsertPoint(NoStoreBB);
@@ -1107,8 +1095,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(FailureBB);
if (ShouldInsertFencesForAtomic)
- TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitTrailingFence(Builder, CI, FailureOrder);
Builder.CreateBr(ExitBB);
// Finally, we have control-flow based knowledge of whether the cmpxchg
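The fence hooks now receive the instruction itself, letting targets inspect the operation rather than coarse IsStore/IsLoad flags. The transformation being driven here can be modeled directly with C++11 atomics: demote the access to monotonic and bracket it with explicit fences of the original strength.

    #include <atomic>

    // Self-contained model of bracketInstWithFences for a seq_cst exchange:
    // the access itself becomes relaxed, and the ordering moves into fences.
    int bracketedExchange(std::atomic<int> &A, int NewVal) {
      std::atomic_thread_fence(std::memory_order_seq_cst); // leading fence
      int Old = A.exchange(NewVal, std::memory_order_relaxed); // demoted op
      std::atomic_thread_fence(std::memory_order_seq_cst); // trailing fence
      return Old;
    }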
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 26da748fa244..55a27e2fb79e 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -23,6 +23,7 @@ add_llvm_library(LLVMCodeGen
ExecutionDepsFix.cpp
ExpandISelPseudos.cpp
ExpandPostRAPseudos.cpp
+ ExpandReductions.cpp
FaultMaps.cpp
FEntryInserter.cpp
FuncletLayout.cpp
@@ -48,6 +49,7 @@ add_llvm_library(LLVMCodeGen
LivePhysRegs.cpp
LiveRangeCalc.cpp
LiveRangeEdit.cpp
+ LiveRangeShrink.cpp
LiveRegMatrix.cpp
LiveRegUnits.cpp
LiveStackAnalysis.cpp
@@ -118,6 +120,7 @@ add_llvm_library(LLVMCodeGen
SafeStack.cpp
SafeStackColoring.cpp
SafeStackLayout.cpp
+ ScalarizeMaskedMemIntrin.cpp
ScheduleDAG.cpp
ScheduleDAGInstrs.cpp
ScheduleDAGPrinter.cpp
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 3fc12ccc3b60..4d30c6574b12 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -43,6 +43,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeLiveDebugValuesPass(Registry);
initializeLiveDebugVariablesPass(Registry);
initializeLiveIntervalsPass(Registry);
+ initializeLiveRangeShrinkPass(Registry);
initializeLiveStacksPass(Registry);
initializeLiveVariablesPass(Registry);
initializeLocalStackSlotPassPass(Registry);
@@ -79,7 +80,8 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeRAGreedyPass(Registry);
initializeRegisterCoalescerPass(Registry);
initializeRenameIndependentSubregsPass(Registry);
- initializeSafeStackPass(Registry);
+ initializeSafeStackLegacyPassPass(Registry);
+ initializeScalarizeMaskedMemIntrinPass(Registry);
initializeShrinkWrapPass(Registry);
initializeSlotIndexesPass(Registry);
initializeStackColoringPass(Registry);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index c6c93811a0f9..f2e024c5e3bd 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -295,7 +295,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
if (PSI->isFunctionHotInCallGraph(&F))
F.setSectionPrefix(".hot");
else if (PSI->isFunctionColdInCallGraph(&F))
- F.setSectionPrefix(".cold");
+ F.setSectionPrefix(".unlikely");
}
/// This optimization identifies DIV instructions that can be
@@ -1549,519 +1549,6 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
return MadeChange;
}
-// Translate a masked load intrinsic like
-// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
-// <16 x i1> %mask, <16 x i32> %passthru)
-// to a chain of basic blocks, with loading element one-by-one if
-// the appropriate mask bit is set
-//
-// %1 = bitcast i8* %addr to i32*
-// %2 = extractelement <16 x i1> %mask, i32 0
-// %3 = icmp eq i1 %2, true
-// br i1 %3, label %cond.load, label %else
-//
-//cond.load: ; preds = %0
-// %4 = getelementptr i32* %1, i32 0
-// %5 = load i32* %4
-// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
-// br label %else
-//
-//else: ; preds = %0, %cond.load
-// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
-// %7 = extractelement <16 x i1> %mask, i32 1
-// %8 = icmp eq i1 %7, true
-// br i1 %8, label %cond.load1, label %else2
-//
-//cond.load1: ; preds = %else
-// %9 = getelementptr i32* %1, i32 1
-// %10 = load i32* %9
-// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
-// br label %else2
-//
-//else2: ; preds = %else, %cond.load1
-// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
-// %12 = extractelement <16 x i1> %mask, i32 2
-// %13 = icmp eq i1 %12, true
-// br i1 %13, label %cond.load4, label %else5
-//
-static void scalarizeMaskedLoad(CallInst *CI) {
- Value *Ptr = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- VectorType *VecType = dyn_cast<VectorType>(CI->getType());
- assert(VecType && "Unexpected return type of masked load intrinsic");
-
- Type *EltTy = CI->getType()->getVectorElementType();
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- BasicBlock *CondBlock = nullptr;
- BasicBlock *PrevIfBlock = CI->getParent();
-
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- // Short-cut if the mask is all-true.
- bool IsAllOnesMask = isa<Constant>(Mask) &&
- cast<Constant>(Mask)->isAllOnesValue();
-
- if (IsAllOnesMask) {
- Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- // Adjust alignment for the scalar instruction.
- AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8);
- // Bitcast %addr fron i8* to EltTy*
- Type *NewPtrType =
- EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
- Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
- unsigned VectorWidth = VecType->getNumElements();
-
- Value *UndefVal = UndefValue::get(VecType);
-
- // The result vector
- Value *VResult = UndefVal;
-
- if (isa<ConstantVector>(Mask)) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal);
- VResult = Builder.CreateInsertElement(VResult, Load,
- Builder.getInt32(Idx));
- }
- Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- PHINode *Phi = nullptr;
- Value *PrevPhi = UndefVal;
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
- // %to_load = icmp eq i1 %mask_1, true
- // br i1 %to_load, label %cond.load, label %else
- //
- if (Idx > 0) {
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- PrevPhi = Phi;
- VResult = Phi;
- }
-
- Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1));
-
- // Create "cond" block
- //
- // %EltAddr = getelementptr i32* %1, i32 0
- // %Elt = load i32* %EltAddr
- // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
- //
- CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
- Builder.SetInsertPoint(InsertPt);
-
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
- VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock =
- CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- PrevIfBlock = IfBlock;
- IfBlock = NewIfBlock;
- }
-
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
-}
-
-// Translate a masked store intrinsic, like
-// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
-// <16 x i1> %mask)
-// to a chain of basic blocks, that stores element one-by-one if
-// the appropriate mask bit is set
-//
-// %1 = bitcast i8* %addr to i32*
-// %2 = extractelement <16 x i1> %mask, i32 0
-// %3 = icmp eq i1 %2, true
-// br i1 %3, label %cond.store, label %else
-//
-// cond.store: ; preds = %0
-// %4 = extractelement <16 x i32> %val, i32 0
-// %5 = getelementptr i32* %1, i32 0
-// store i32 %4, i32* %5
-// br label %else
-//
-// else: ; preds = %0, %cond.store
-// %6 = extractelement <16 x i1> %mask, i32 1
-// %7 = icmp eq i1 %6, true
-// br i1 %7, label %cond.store1, label %else2
-//
-// cond.store1: ; preds = %else
-// %8 = extractelement <16 x i32> %val, i32 1
-// %9 = getelementptr i32* %1, i32 1
-// store i32 %8, i32* %9
-// br label %else2
-// . . .
-static void scalarizeMaskedStore(CallInst *CI) {
- Value *Src = CI->getArgOperand(0);
- Value *Ptr = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- VectorType *VecType = dyn_cast<VectorType>(Src->getType());
- assert(VecType && "Unexpected data type in masked store intrinsic");
-
- Type *EltTy = VecType->getElementType();
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- // Short-cut if the mask is all-true.
- bool IsAllOnesMask = isa<Constant>(Mask) &&
- cast<Constant>(Mask)->isAllOnesValue();
-
- if (IsAllOnesMask) {
- Builder.CreateAlignedStore(Src, Ptr, AlignVal);
- CI->eraseFromParent();
- return;
- }
-
- // Adjust alignment for the scalar instruction.
- AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8);
- // Bitcast %addr fron i8* to EltTy*
- Type *NewPtrType =
- EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
- Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
- unsigned VectorWidth = VecType->getNumElements();
-
- if (isa<ConstantVector>(Mask)) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
- }
- CI->eraseFromParent();
- return;
- }
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
- // %to_store = icmp eq i1 %mask_1, true
- // br i1 %to_store, label %cond.store, label %else
- //
- Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1));
-
- // Create "cond" block
- //
- // %OneElt = extractelement <16 x i32> %Src, i32 Idx
- // %EltAddr = getelementptr i32* %1, i32 0
- // %store i32 %OneElt, i32* %EltAddr
- //
- BasicBlock *CondBlock =
- IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
- Builder.SetInsertPoint(InsertPt);
-
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock =
- CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- IfBlock = NewIfBlock;
- }
- CI->eraseFromParent();
-}
-
-// Translate a masked gather intrinsic like
-// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
-// <16 x i1> %Mask, <16 x i32> %Src)
-// to a chain of basic blocks, with loading element one-by-one if
-// the appropriate mask bit is set
-//
-// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
-// % Mask0 = extractelement <16 x i1> %Mask, i32 0
-// % ToLoad0 = icmp eq i1 % Mask0, true
-// br i1 % ToLoad0, label %cond.load, label %else
-//
-// cond.load:
-// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
-// % Load0 = load i32, i32* % Ptr0, align 4
-// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
-// br label %else
-//
-// else:
-// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
-// % Mask1 = extractelement <16 x i1> %Mask, i32 1
-// % ToLoad1 = icmp eq i1 % Mask1, true
-// br i1 % ToLoad1, label %cond.load1, label %else2
-//
-// cond.load1:
-// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
-// % Load1 = load i32, i32* % Ptr1, align 4
-// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
-// br label %else2
-// . . .
-// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
-// ret <16 x i32> %Result
-static void scalarizeMaskedGather(CallInst *CI) {
- Value *Ptrs = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
-
- VectorType *VecType = dyn_cast<VectorType>(CI->getType());
-
- assert(VecType && "Unexpected return type of masked load intrinsic");
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- BasicBlock *CondBlock = nullptr;
- BasicBlock *PrevIfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
-
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- Value *UndefVal = UndefValue::get(VecType);
-
- // The result vector
- Value *VResult = UndefVal;
- unsigned VectorWidth = VecType->getNumElements();
-
- // Shorten the way if the mask is a vector of constants.
- bool IsConstMask = isa<ConstantVector>(Mask);
-
- if (IsConstMask) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
- "Load" + Twine(Idx));
- VResult = Builder.CreateInsertElement(VResult, Load,
- Builder.getInt32(Idx),
- "Res" + Twine(Idx));
- }
- Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- PHINode *Phi = nullptr;
- Value *PrevPhi = UndefVal;
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %Mask1 = extractelement <16 x i1> %Mask, i32 1
- // %ToLoad1 = icmp eq i1 %Mask1, true
- // br i1 %ToLoad1, label %cond.load, label %else
- //
- if (Idx > 0) {
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- PrevPhi = Phi;
- VResult = Phi;
- }
-
- Value *Predicate = Builder.CreateExtractElement(Mask,
- Builder.getInt32(Idx),
- "Mask" + Twine(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1),
- "ToLoad" + Twine(Idx));
-
- // Create "cond" block
- //
- // %EltAddr = getelementptr i32* %1, i32 0
- // %Elt = load i32* %EltAddr
- // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
- //
- CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
- Builder.SetInsertPoint(InsertPt);
-
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
- "Load" + Twine(Idx));
- VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
- "Res" + Twine(Idx));
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- PrevIfBlock = IfBlock;
- IfBlock = NewIfBlock;
- }
-
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
-}
-
-// Translate a masked scatter intrinsic, like
-// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4,
-// <16 x i1> %Mask)
-// to a chain of basic blocks, that stores element one-by-one if
-// the appropriate mask bit is set.
-//
-// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
-// % Mask0 = extractelement <16 x i1> % Mask, i32 0
-// % ToStore0 = icmp eq i1 % Mask0, true
-// br i1 %ToStore0, label %cond.store, label %else
-//
-// cond.store:
-// % Elt0 = extractelement <16 x i32> %Src, i32 0
-// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
-// store i32 %Elt0, i32* % Ptr0, align 4
-// br label %else
-//
-// else:
-// % Mask1 = extractelement <16 x i1> % Mask, i32 1
-// % ToStore1 = icmp eq i1 % Mask1, true
-// br i1 % ToStore1, label %cond.store1, label %else2
-//
-// cond.store1:
-// % Elt1 = extractelement <16 x i32> %Src, i32 1
-// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
-// store i32 % Elt1, i32* % Ptr1, align 4
-// br label %else2
-// . . .
-static void scalarizeMaskedScatter(CallInst *CI) {
- Value *Src = CI->getArgOperand(0);
- Value *Ptrs = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
-
- assert(isa<VectorType>(Src->getType()) &&
- "Unexpected data type in masked scatter intrinsic");
- assert(isa<VectorType>(Ptrs->getType()) &&
- isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
- "Vector of pointers is expected in masked scatter intrinsic");
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- unsigned VectorWidth = Src->getType()->getVectorNumElements();
-
- // Shorten the way if the mask is a vector of constants.
- bool IsConstMask = isa<ConstantVector>(Mask);
-
- if (IsConstMask) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
- "Elt" + Twine(Idx));
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
- }
- CI->eraseFromParent();
- return;
- }
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- // Fill the "else" block, created in the previous iteration
- //
- // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx
- // % ToStore = icmp eq i1 % Mask1, true
- // br i1 % ToStore, label %cond.store, label %else
- //
- Value *Predicate = Builder.CreateExtractElement(Mask,
- Builder.getInt32(Idx),
- "Mask" + Twine(Idx));
- Value *Cmp =
- Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1),
- "ToStore" + Twine(Idx));
-
- // Create "cond" block
- //
- // % Elt1 = extractelement <16 x i32> %Src, i32 1
- // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
- // %store i32 % Elt1, i32* % Ptr1
- //
- BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
- Builder.SetInsertPoint(InsertPt);
-
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
- "Elt" + Twine(Idx));
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- IfBlock = NewIfBlock;
- }
- CI->eraseFromParent();
-}
-
/// If counting leading or trailing zeros is an expensive operation and a zero
/// input is defined, add a check for zero to avoid calling the intrinsic.
///
@@ -2242,39 +1729,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
}
return true;
}
- case Intrinsic::masked_load: {
- // Scalarize unsupported vector masked load
- if (!TTI->isLegalMaskedLoad(CI->getType())) {
- scalarizeMaskedLoad(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_store: {
- if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
- scalarizeMaskedStore(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_gather: {
- if (!TTI->isLegalMaskedGather(CI->getType())) {
- scalarizeMaskedGather(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_scatter: {
- if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
- scalarizeMaskedScatter(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
case Intrinsic::aarch64_stlxr:
case Intrinsic::aarch64_stxr: {
ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
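The scalarization helpers deleted above are not lost: the new ScalarizeMaskedMemIntrin.cpp added to the CodeGen build (and registered in CodeGen.cpp earlier in this patch) takes over the expansion. Their core semantics fit in a few lines of scalar C++ — a lane is touched only when its mask bit is set, so masked-off lanes can never fault:

    #include <array>
    #include <cstddef>

    // Scalar model of @llvm.masked.load: kept lanes come from memory,
    // masked-off lanes keep the pass-through value.
    template <std::size_t N>
    std::array<int, N> maskedLoad(const int *Ptr,
                                  const std::array<bool, N> &Mask,
                                  const std::array<int, N> &PassThru) {
      std::array<int, N> Result = PassThru;
      for (std::size_t I = 0; I != N; ++I)
        if (Mask[I])
          Result[I] = Ptr[I];
      return Result;
    }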
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index ab2382e2db6d..e860906043dd 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -142,8 +142,9 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
MachineOperand &DstMO = MI->getOperand(0);
MachineOperand &SrcMO = MI->getOperand(1);
- if (SrcMO.getReg() == DstMO.getReg()) {
- DEBUG(dbgs() << "identity copy: " << *MI);
+ bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg());
+ if (IdentityCopy || SrcMO.isUndef()) {
+ DEBUG(dbgs() << (IdentityCopy ? "identity copy: " : "undef copy: ") << *MI);
// No need to insert an identity copy instruction, but replace with a KILL
// if liveness is changed.
if (SrcMO.isUndef() || MI->getNumOperands() > 2) {
diff --git a/lib/CodeGen/ExpandReductions.cpp b/lib/CodeGen/ExpandReductions.cpp
new file mode 100644
index 000000000000..a40ea28056dd
--- /dev/null
+++ b/lib/CodeGen/ExpandReductions.cpp
@@ -0,0 +1,167 @@
+//===--- ExpandReductions.cpp - Expand experimental reduction intrinsics --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for reduction intrinsics, allowing targets
+// to enable the experimental intrinsics until just before codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ExpandReductions.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+
+unsigned getOpcode(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ return Instruction::FAdd;
+ case Intrinsic::experimental_vector_reduce_fmul:
+ return Instruction::FMul;
+ case Intrinsic::experimental_vector_reduce_add:
+ return Instruction::Add;
+ case Intrinsic::experimental_vector_reduce_mul:
+ return Instruction::Mul;
+ case Intrinsic::experimental_vector_reduce_and:
+ return Instruction::And;
+ case Intrinsic::experimental_vector_reduce_or:
+ return Instruction::Or;
+ case Intrinsic::experimental_vector_reduce_xor:
+ return Instruction::Xor;
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ return Instruction::ICmp;
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ return Instruction::FCmp;
+ default:
+ llvm_unreachable("Unexpected ID");
+ }
+}
+
+RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_smax:
+ return RecurrenceDescriptor::MRK_SIntMax;
+ case Intrinsic::experimental_vector_reduce_smin:
+ return RecurrenceDescriptor::MRK_SIntMin;
+ case Intrinsic::experimental_vector_reduce_umax:
+ return RecurrenceDescriptor::MRK_UIntMax;
+ case Intrinsic::experimental_vector_reduce_umin:
+ return RecurrenceDescriptor::MRK_UIntMin;
+ case Intrinsic::experimental_vector_reduce_fmax:
+ return RecurrenceDescriptor::MRK_FloatMax;
+ case Intrinsic::experimental_vector_reduce_fmin:
+ return RecurrenceDescriptor::MRK_FloatMin;
+ default:
+ return RecurrenceDescriptor::MRK_Invalid;
+ }
+}
+
+bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
+ bool Changed = false;
+ SmallVector<IntrinsicInst*, 4> Worklist;
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+ if (auto II = dyn_cast<IntrinsicInst>(&*I))
+ Worklist.push_back(II);
+
+ for (auto *II : Worklist) {
+ IRBuilder<> Builder(II);
+ Value *Vec = nullptr;
+ auto ID = II->getIntrinsicID();
+ auto MRK = RecurrenceDescriptor::MRK_Invalid;
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ case Intrinsic::experimental_vector_reduce_fmul:
+ // FMFs must be attached to the call; otherwise it's an ordered reduction
+ // and it can't be handled by generating this shuffle sequence.
+ // TODO: Implement scalarization of ordered reductions here for targets
+ // without native support.
+ if (!II->getFastMathFlags().unsafeAlgebra())
+ continue;
+ Vec = II->getArgOperand(1);
+ break;
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ Vec = II->getArgOperand(0);
+ MRK = getMRK(ID);
+ break;
+ default:
+ continue;
+ }
+ if (!TTI->shouldExpandReduction(II))
+ continue;
+ auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+ II->replaceAllUsesWith(Rdx);
+ II->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+}
+
+class ExpandReductions : public FunctionPass {
+public:
+ static char ID;
+ ExpandReductions() : FunctionPass(ID) {
+ initializeExpandReductionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return expandReductions(F, TTI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char ExpandReductions::ID;
+INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
+ "Expand reduction intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
+ "Expand reduction intrinsics", false, false)
+
+FunctionPass *llvm::createExpandReductionsPass() {
+ return new ExpandReductions();
+}
+
+PreservedAnalyses ExpandReductionsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!expandReductions(F, &TTI))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
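
Editor's note: for readers unfamiliar with the shuffle sequence the pass above relies on, the following is a minimal, hedged sketch of a log2 shuffle reduction in the style of getShuffleReduction. It assumes a power-of-two element count and a plain binary opcode; the min/max kinds go through compare+select instead. emitShuffleReduction is an illustrative name, not part of this commit.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch: halve the vector each step, combining lanes pairwise; the final
// scalar ends up in lane 0.
static Value *emitShuffleReduction(IRBuilder<> &Builder, Value *Vec,
                                   Instruction::BinaryOps RdxOpcode) {
  unsigned NumElts = Vec->getType()->getVectorNumElements();
  for (unsigned Width = NumElts / 2; Width >= 1; Width /= 2) {
    // Bring lanes [Width, 2*Width) down to [0, Width); the upper lanes are
    // don't-care, so an identity index is used for them.
    SmallVector<uint32_t, 16> Mask;
    for (unsigned I = 0; I != NumElts; ++I)
      Mask.push_back(I < Width ? Width + I : I);
    Value *Upper = Builder.CreateShuffleVector(
        Vec, UndefValue::get(Vec->getType()), Mask);
    Vec = Builder.CreateBinOp(RdxOpcode, Vec, Upper);
  }
  return Builder.CreateExtractElement(Vec, Builder.getInt32(0));
}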
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index eaf4056e47ea..4d4591042296 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -162,7 +162,7 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI,
return std::get<0>(getAction(MI, MRI)) == Legal;
}
-LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
+Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect,
LegalizeAction Action) const {
switch(Action) {
default:
@@ -174,20 +174,20 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
return Aspect.Type;
case NarrowScalar: {
return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
+ [](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
}
case WidenScalar: {
- return findLegalType(Aspect, [&](LLT Ty) -> LLT {
+ return findLegalType(Aspect, [](LLT Ty) -> LLT {
return Ty.getSizeInBits() < 8 ? LLT::scalar(8) : Ty.doubleScalarSize();
});
}
case FewerElements: {
return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.halfElements(); });
+ [](LLT Ty) -> LLT { return Ty.halfElements(); });
}
case MoreElements: {
return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.doubleElements(); });
+ [](LLT Ty) -> LLT { return Ty.doubleElements(); });
}
}
}
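
Side note: findLegalType's switch now surfaces the unsupported cases through Optional<LLT> rather than a sentinel type. A minimal usage sketch, under the assumption that callers just want a yes/no plus the type (pickLegalType is a made-up helper name):

#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/LowLevelType.h"

using namespace llvm;

static bool pickLegalType(Optional<LLT> MaybeTy, LLT &Out) {
  if (!MaybeTy)
    return false; // No legalizing transformation produced a valid type.
  Out = *MaybeTy; // Safe to dereference once the check succeeded.
  return true;
}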
diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 7248f50945d0..2eb3cdee694d 100644
--- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -204,12 +204,8 @@ uint64_t RegBankSelect::getRepairCost(
// TODO: use a dedicated constant for ImpossibleCost.
if (Cost != UINT_MAX)
return Cost;
- assert(!TPC->isGlobalISelAbortEnabled() &&
- "Legalization not available yet");
// Return the legalization cost of that repairing.
}
- assert(!TPC->isGlobalISelAbortEnabled() &&
- "Complex repairing not implemented yet");
return UINT_MAX;
}
@@ -452,6 +448,11 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
// Sums up the repairing cost of MO at each insertion point.
uint64_t RepairCost = getRepairCost(MO, ValMapping);
+
+  // This cost is impossible to repair; skip it.
+ if (RepairCost == UINT_MAX)
+ continue;
+
// Bias used for splitting: 5%.
const uint64_t PercentageForBias = 5;
uint64_t Bias = (RepairCost * PercentageForBias + 99) / 100;
diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp
index 3c93f8123b0d..254bdf10d804 100644
--- a/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/lib/CodeGen/GlobalISel/Utils.cpp
@@ -110,3 +110,11 @@ Optional<int64_t> llvm::getConstantVRegVal(unsigned VReg,
return None;
}
+
+const llvm::ConstantFP* llvm::getConstantFPVRegVal(unsigned VReg,
+ const MachineRegisterInfo &MRI) {
+ MachineInstr *MI = MRI.getVRegDef(VReg);
+ if (TargetOpcode::G_FCONSTANT != MI->getOpcode())
+ return nullptr;
+ return MI->getOperand(1).getFPImm();
+}
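
Usage sketch for the new helper: it returns null unless the vreg is defined by G_FCONSTANT, so a null check doubles as an opcode check. isFPZero is an illustrative wrapper, not part of this commit.

#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

// Returns true iff VReg is defined by G_FCONSTANT holding +0.0.
static bool isFPZero(unsigned VReg, const MachineRegisterInfo &MRI) {
  if (const ConstantFP *C = getConstantFPVRegVal(VReg, MRI))
    return C->isZero() && !C->isNegative();
  return false; // Not defined by G_FCONSTANT.
}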
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 37fe41582333..628d599a3cc7 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -1318,7 +1318,8 @@ static bool canFallThroughTo(MachineBasicBlock &MBB, MachineBasicBlock &ToMBB) {
return false;
PI = I++;
}
- return true;
+  // Finally, check that the last I is indeed a successor of PI.
+ return PI->isSuccessor(&*I);
}
/// Invalidate predecessor BB info so it would be re-analyzed to determine if it
@@ -1587,22 +1588,32 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
BBCvt = MBPI->getEdgeProbability(BBI.BB, &CvtMBB);
}
+  // To be able to insert code freely at the end of BBI, we sometimes remove
+  // the branch from BBI to NextMBB temporarily. Remember if this happened.
+ bool RemovedBranchToNextMBB = false;
if (CvtMBB.pred_size() > 1) {
BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
// Copy instructions in the true block, predicate them, and add them to
// the entry block.
CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true);
- // RemoveExtraEdges won't work if the block has an unanalyzable branch, so
- // explicitly remove CvtBBI as a successor.
+ // Keep the CFG updated.
BBI.BB->removeSuccessor(&CvtMBB, true);
} else {
// Predicate the 'true' block after removing its branch.
CvtBBI->NonPredSize -= TII->removeBranch(CvtMBB);
PredicateBlock(*CvtBBI, CvtMBB.end(), Cond);
- // Now merge the entry of the triangle with the true block.
+      // Remove the branch from the entry of the triangle to NextMBB so that
+      // the merge below can proceed. Keep the CFG updated, but remember that
+      // we removed the branch, since we still want NextMBB to execute, either
+      // by reintroducing a branch to it or by merging it into the entry
+      // block. Which of the two happens is decided further down.
BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
+ BBI.BB->removeSuccessor(&NextMBB, true);
+ RemovedBranchToNextMBB = true;
+
+ // Now merge the entry of the triangle with the true block.
MergeBlocks(BBI, *CvtBBI, false);
}
@@ -1640,12 +1651,19 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
// block. By not merging them, we make it possible to iteratively
// ifcvt the blocks.
if (!HasEarlyExit &&
- NextMBB.pred_size() == 1 && !NextBBI->HasFallThrough &&
+      // We might have removed BBI from NextMBB's predecessor list above, but
+      // conceptually it still is a predecessor, so account for that here.
+ (NextMBB.pred_size() == (RemovedBranchToNextMBB ? 0 : 1)) &&
+ !NextBBI->HasFallThrough &&
!NextMBB.hasAddressTaken()) {
+ // We will merge NextBBI into BBI, and thus remove the current
+ // fallthrough from BBI into CvtBBI.
+ BBI.BB->removeSuccessor(&CvtMBB, true);
MergeBlocks(BBI, *NextBBI);
FalseBBDead = true;
} else {
InsertUncondBranch(*BBI.BB, NextMBB, TII);
+ BBI.BB->addSuccessor(&NextMBB);
BBI.HasFallThrough = false;
}
// Mixed predicated and unpredicated code. This cannot be iteratively
@@ -1653,8 +1671,6 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
IterIfcvt = false;
}
- RemoveExtraEdges(BBI);
-
// Update block info. BB can be iteratively if-converted.
if (!IterIfcvt)
BBI.IsDone = true;
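
The predecessor-count adjustment above is the subtle part of this change: once the entry's branch to NextMBB has been removed, NextMBB temporarily has one predecessor fewer than it conceptually should. Restated as a standalone predicate (a sketch with made-up parameter names, mirroring the condition in the hunk):

// Sketch of the merge condition from the hunk above.
static bool canMergeNextBlock(unsigned PredCount, bool RemovedBranchToNextMBB,
                              bool HasEarlyExit, bool HasFallThrough,
                              bool AddressTaken) {
  // If the entry-to-NextMBB edge was already removed, the expected
  // predecessor count drops from one to zero.
  unsigned Expected = RemovedBranchToNextMBB ? 0 : 1;
  return !HasEarlyExit && PredCount == Expected && !HasFallThrough &&
         !AddressTaken;
}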
diff --git a/lib/CodeGen/LiveRangeShrink.cpp b/lib/CodeGen/LiveRangeShrink.cpp
new file mode 100644
index 000000000000..00182e2c779f
--- /dev/null
+++ b/lib/CodeGen/LiveRangeShrink.cpp
@@ -0,0 +1,211 @@
+//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass moves instructions close to the definitions of their operands in
+/// order to shrink the live range of each defining instruction. The code
+/// motion is limited to within a basic block. The moved instruction must have
+/// exactly one def and more than one use, and each of those uses must be the
+/// sole use of its defining instruction.
+///
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "lrshrink"
+
+STATISTIC(NumInstrsHoistedToShrinkLiveRange,
+ "Number of insructions hoisted to shrink live range.");
+
+using namespace llvm;
+
+namespace {
+class LiveRangeShrink : public MachineFunctionPass {
+public:
+ static char ID;
+
+ LiveRangeShrink() : MachineFunctionPass(ID) {
+ initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "Live Range Shrink"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char LiveRangeShrink::ID = 0;
+char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID;
+
+INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false,
+ false)
+namespace {
+typedef DenseMap<MachineInstr *, unsigned> InstOrderMap;
+
+/// Returns \p New if it is dominated by \p Old, otherwise returns \p Old.
+/// \p M maps each instruction to a dominating order number such that
+/// M[A] > M[B] guarantees that A is dominated by B.
+/// If \p New is not in \p M, returns \p Old. Otherwise, if \p Old is null,
+/// returns \p New.
+MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old,
+ const InstOrderMap &M) {
+ auto NewIter = M.find(&New);
+ if (NewIter == M.end())
+ return Old;
+ if (Old == nullptr)
+ return &New;
+ unsigned OrderOld = M.find(Old)->second;
+ unsigned OrderNew = NewIter->second;
+ if (OrderOld != OrderNew)
+ return OrderOld < OrderNew ? &New : Old;
+  // OrderOld == OrderNew, so iterate forward from Old to see whether it can
+  // reach New; if it can, New is dominated by Old.
+ for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew;
+ I = I->getNextNode())
+ if (I == &New)
+ return &New;
+ return Old;
+}
+
+/// Builds Instruction to its dominating order number map \p M by traversing
+/// from instruction \p Start.
+void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) {
+ M.clear();
+ unsigned i = 0;
+ for (MachineInstr &I : make_range(Start, Start->getParent()->end()))
+ M[&I] = i++;
+}
+} // end anonymous namespace
+
+bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+ InstOrderMap IOM;
+  // Map from a register to the instruction order (the value in IOM) at which
+  // the register is last used. When moving an instruction up, we must make
+  // sure that none of its defs (including dead defs) cross that register's
+  // last use.
+ DenseMap<unsigned, unsigned> UseMap;
+
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.empty())
+ continue;
+ bool SawStore = false;
+ BuildInstOrderMap(MBB.begin(), IOM);
+ UseMap.clear();
+
+ for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
+ MachineInstr &MI = *Next;
+ ++Next;
+ if (MI.isPHI() || MI.isDebugValue())
+ continue;
+ if (MI.mayStore())
+ SawStore = true;
+
+ unsigned CurrentOrder = IOM[&MI];
+ unsigned Barrier = 0;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.isDebug())
+ continue;
+ if (MO.isUse())
+ UseMap[MO.getReg()] = CurrentOrder;
+ else if (MO.isDead() && UseMap.count(MO.getReg()))
+          // Barrier is the last instruction where MO gets used. MI should not
+          // be moved above Barrier.
+ Barrier = std::max(Barrier, UseMap[MO.getReg()]);
+ }
+
+ if (!MI.isSafeToMove(nullptr, SawStore)) {
+ // If MI has side effects, it should become a barrier for code motion.
+        // IOM is rebuilt from the next instruction to prevent later
+ // instructions from being moved before this MI.
+ if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
+ BuildInstOrderMap(Next, IOM);
+ SawStore = false;
+ }
+ continue;
+ }
+
+ const MachineOperand *DefMO = nullptr;
+ MachineInstr *Insert = nullptr;
+
+ // Number of live-ranges that will be shortened. We do not count
+ // live-ranges that are defined by a COPY as it could be coalesced later.
+ unsigned NumEligibleUse = 0;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.isDead() || MO.isDebug())
+ continue;
+ unsigned Reg = MO.getReg();
+ // Do not move the instruction if it def/uses a physical register,
+ // unless it is a constant physical register.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ !MRI.isConstantPhysReg(Reg)) {
+ Insert = nullptr;
+ break;
+ }
+ if (MO.isDef()) {
+ // Do not move if there is more than one def.
+ if (DefMO) {
+ Insert = nullptr;
+ break;
+ }
+ DefMO = &MO;
+ } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg)) {
+ MachineInstr &DefInstr = *MRI.def_instr_begin(Reg);
+ if (!DefInstr.isCopy())
+ NumEligibleUse++;
+ Insert = FindDominatedInstruction(DefInstr, Insert, IOM);
+ } else {
+ Insert = nullptr;
+ break;
+ }
+ }
+      // Move the instruction when the number of shrunk live ranges is > 1.
+ if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) {
+ MachineBasicBlock::iterator I = std::next(Insert->getIterator());
+ // Skip all the PHI and debug instructions.
+ while (I != MBB.end() && (I->isPHI() || I->isDebugValue()))
+ I = std::next(I);
+ if (I == MI.getIterator())
+ continue;
+
+ // Update the dominator order to be the same as the insertion point.
+      // We do this to maintain a non-decreasing order without needing to
+      // update all instruction orders after the insertion point.
+ unsigned NewOrder = IOM[&*I];
+ IOM[&MI] = NewOrder;
+ NumInstrsHoistedToShrinkLiveRange++;
+
+      // Find the DBG_VALUE instructions following MI that describe its def,
+      // so they are moved together with it.
+ MachineBasicBlock::iterator EndIter = std::next(MI.getIterator());
+ if (MI.getOperand(0).isReg())
+ for (; EndIter != MBB.end() && EndIter->isDebugValue() &&
+ EndIter->getOperand(0).isReg() &&
+ EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg();
+ ++EndIter, ++Next)
+ IOM[&*EndIter] = NewOrder;
+ MBB.splice(I, &MBB, MI.getIterator(), EndIter);
+ }
+ }
+ }
+ return false;
+}
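
To see the order-map invariant in action, here is a toy, self-contained reconstruction (plain C++, no LLVM types): instructions get increasing order numbers, and a hoisted instruction simply adopts the order number of its insertion point, keeping the map non-decreasing without a global renumbering.

#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Block = {"def a", "def b", "use a, b", "ret"};
  std::map<std::string, unsigned> Order;
  unsigned N = 0;
  for (const auto &Inst : Block)
    Order[Inst] = N++; // BuildInstOrderMap analogue.
  // Hoist "use a, b" right after its last operand def ("def b"): it takes
  // the order number of the insertion point instead of renumbering.
  Order["use a, b"] = Order["def b"];
  for (const auto &Inst : Block)
    std::cout << Order[Inst] << ": " << Inst << "\n"; // Prints 0, 1, 1, 3.
}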
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 3568b0294ad9..a9aec926115a 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -767,7 +767,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB,
MachineBasicBlock *SuccBB) {
const unsigned NumNew = BB->getNumber();
- SmallSet<unsigned, 16> Defs, Kills;
+ DenseSet<unsigned> Defs, Kills;
MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end();
for (; BBI != BBE && BBI->isPHI(); ++BBI) {
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 4cfc128a8c1d..5003115a770f 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -133,6 +133,14 @@ static cl::opt<unsigned> TailDupPlacementThreshold(
"that won't conflict."), cl::init(2),
cl::Hidden);
+// Heuristic for aggressive tail duplication.
+static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
+ "tail-dup-placement-aggressive-threshold",
+ cl::desc("Instruction cutoff for aggressive tail duplication during "
+ "layout. Used at -O3. Tail merging during layout is forced to "
+ "have a threshold that won't conflict."), cl::init(3),
+ cl::Hidden);
+
// Heuristic for tail duplication.
static cl::opt<unsigned> TailDupPlacementPenalty(
"tail-dup-placement-penalty",
@@ -2646,9 +2654,26 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
assert(BlockToChain.empty());
assert(ComputedEdges.empty());
+ unsigned TailDupSize = TailDupPlacementThreshold;
+ // If only the aggressive threshold is explicitly set, use it.
+ if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
+ TailDupPlacementThreshold.getNumOccurrences() == 0)
+ TailDupSize = TailDupPlacementAggressiveThreshold;
+
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+  // For aggressive optimization, we can adjust some thresholds to be less
+  // conservative.
+ if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
+    // At O3 we should be more willing to copy blocks for tail duplication,
+    // even though this increases size pressure. Do this unless only the
+    // regular threshold is explicitly set.
+ if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
+ TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
+ TailDupSize = TailDupPlacementAggressiveThreshold;
+ }
+
if (TailDupPlacement) {
MPDT = &getAnalysis<MachinePostDominatorTree>();
- unsigned TailDupSize = TailDupPlacementThreshold;
if (MF.getFunction()->optForSize())
TailDupSize = 1;
TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
@@ -2658,7 +2683,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
buildCFGChains();
// Changing the layout can create new tail merging opportunities.
- TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
// TailMerge can create jump into if branches that make CFG irreducible for
// HW that requires structured CFG.
bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
@@ -2666,7 +2690,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
BranchFoldPlacement;
// No tail merging opportunities if the block number is less than four.
if (MF.size() > 3 && EnableTailMerge) {
- unsigned TailMergeSize = TailDupPlacementThreshold + 1;
+ unsigned TailMergeSize = TailDupSize + 1;
BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
*MBPI, TailMergeSize);
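
The interplay of the two thresholds above takes a moment to untangle, so here it is restated as a pure function (a sketch; the bool parameters stand in for getNumOccurrences() != 0 and the opt-level/size checks):

// Sketch of the TailDupSize selection logic in the hunks above.
static unsigned pickTailDupSize(unsigned Regular, bool RegularSet,
                                unsigned Aggressive, bool AggressiveSet,
                                bool IsO3OrHigher, bool OptForSize) {
  unsigned Size = Regular;
  if (AggressiveSet && !RegularSet)
    Size = Aggressive; // Only the aggressive flag was given explicitly.
  if (IsO3OrHigher && (!RegularSet || AggressiveSet))
    Size = Aggressive; // At -O3, prefer the aggressive cutoff.
  if (OptForSize)
    Size = 1; // optsize/minsize overrides everything.
  return Size;
}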
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index bfb2cde030dc..ab433273b189 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -2063,12 +2063,12 @@ void MachineVerifier::verifyStackFrame() {
if (I.getOpcode() == FrameSetupOpcode) {
if (BBState.ExitIsSetup)
report("FrameSetup is after another FrameSetup", &I);
- BBState.ExitValue -= TII->getFrameSize(I);
+ BBState.ExitValue -= TII->getFrameTotalSize(I);
BBState.ExitIsSetup = true;
}
if (I.getOpcode() == FrameDestroyOpcode) {
- int Size = TII->getFrameSize(I);
+ int Size = TII->getFrameTotalSize(I);
if (!BBState.ExitIsSetup)
report("FrameDestroy is not after a FrameSetup", &I);
int AbsSPAdj = BBState.ExitValue < 0 ? -BBState.ExitValue :
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index c67a25b888bf..db2264b2439d 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -34,7 +34,7 @@
#include <algorithm>
using namespace llvm;
-#define DEBUG_TYPE "phielim"
+#define DEBUG_TYPE "phi-node-elimination"
static cl::opt<bool>
DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false),
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index bf44ee8453b6..1803ea2b9249 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -3214,7 +3214,7 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
CurrList(WorkList.begin() + PrevSize, WorkList.end());
if (copyCoalesceWorkList(CurrList))
WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(),
- (MachineInstr*)nullptr), WorkList.end());
+ nullptr), WorkList.end());
}
void RegisterCoalescer::coalesceLocals() {
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 35db30f89976..0635e5c0a63c 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -62,10 +62,9 @@ void RegScavenger::init(MachineBasicBlock &MBB) {
}
this->MBB = &MBB;
- for (SmallVectorImpl<ScavengedInfo>::iterator I = Scavenged.begin(),
- IE = Scavenged.end(); I != IE; ++I) {
- I->Reg = 0;
- I->Restore = nullptr;
+ for (ScavengedInfo &SI : Scavenged) {
+ SI.Reg = 0;
+ SI.Restore = nullptr;
}
Tracking = false;
diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp
index 7fa379d80c6c..08b3d345f689 100644
--- a/lib/CodeGen/SafeStack.cpp
+++ b/lib/CodeGen/SafeStack.cpp
@@ -19,6 +19,7 @@
#include "SafeStackLayout.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -92,11 +93,11 @@ public:
/// determined statically), and the unsafe stack, which contains all
/// local variables that are accessed in ways that we can't prove to
/// be safe.
-class SafeStack : public FunctionPass {
- const TargetMachine *TM;
- const TargetLoweringBase *TL;
- const DataLayout *DL;
- ScalarEvolution *SE;
+class SafeStack {
+ Function &F;
+ const TargetLoweringBase &TL;
+ const DataLayout &DL;
+ ScalarEvolution &SE;
Type *StackPtrTy;
Type *IntPtrTy;
@@ -171,33 +172,21 @@ class SafeStack : public FunctionPass {
uint64_t AllocaSize);
public:
- static char ID; // Pass identification, replacement for typeid.
- SafeStack(const TargetMachine *TM)
- : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) {
- initializeSafeStackPass(*PassRegistry::getPassRegistry());
- }
- SafeStack() : SafeStack(nullptr) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ScalarEvolutionWrapperPass>();
- }
-
- bool doInitialization(Module &M) override {
- DL = &M.getDataLayout();
-
- StackPtrTy = Type::getInt8PtrTy(M.getContext());
- IntPtrTy = DL->getIntPtrType(M.getContext());
- Int32Ty = Type::getInt32Ty(M.getContext());
- Int8Ty = Type::getInt8Ty(M.getContext());
-
- return false;
- }
-
- bool runOnFunction(Function &F) override;
-}; // class SafeStack
+ SafeStack(Function &F, const TargetLoweringBase &TL, const DataLayout &DL,
+ ScalarEvolution &SE)
+ : F(F), TL(TL), DL(DL), SE(SE),
+ StackPtrTy(Type::getInt8PtrTy(F.getContext())),
+ IntPtrTy(DL.getIntPtrType(F.getContext())),
+ Int32Ty(Type::getInt32Ty(F.getContext())),
+ Int8Ty(Type::getInt8Ty(F.getContext())) {}
+
+ // Run the transformation on the associated function.
+ // Returns whether the function was changed.
+ bool run();
+};
uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
- uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType());
+ uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType());
if (AI->isArrayAllocation()) {
auto C = dyn_cast<ConstantInt>(AI->getArraySize());
if (!C)
@@ -209,11 +198,11 @@ uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
const Value *AllocaPtr, uint64_t AllocaSize) {
- AllocaOffsetRewriter Rewriter(*SE, AllocaPtr);
- const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr));
+ AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
+ const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
- uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType());
- ConstantRange AccessStartRange = SE->getUnsignedRange(Expr);
+ uint64_t BitWidth = SE.getTypeSizeInBits(Expr->getType());
+ ConstantRange AccessStartRange = SE.getUnsignedRange(Expr);
ConstantRange SizeRange =
ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize));
ConstantRange AccessRange = AccessStartRange.add(SizeRange);
@@ -226,8 +215,8 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
<< *AllocaPtr << "\n"
<< " Access " << *Addr << "\n"
<< " SCEV " << *Expr
- << " U: " << SE->getUnsignedRange(Expr)
- << ", S: " << SE->getSignedRange(Expr) << "\n"
+ << " U: " << SE.getUnsignedRange(Expr)
+ << ", S: " << SE.getSignedRange(Expr) << "\n"
<< " Range " << AccessRange << "\n"
<< " AllocaRange " << AllocaRange << "\n"
<< " " << (Safe ? "safe" : "unsafe") << "\n");
@@ -266,7 +255,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
switch (I->getOpcode()) {
case Instruction::Load: {
- if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr,
+ if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getType()), AllocaPtr,
AllocaSize))
return false;
break;
@@ -282,7 +271,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
return false;
}
- if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()),
+ if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getOperand(0)->getType()),
AllocaPtr, AllocaSize))
return false;
break;
@@ -343,7 +332,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
}
Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) {
- Value *StackGuardVar = TL->getIRStackGuard(IRB);
+ Value *StackGuardVar = TL.getIRStackGuard(IRB);
if (!StackGuardVar)
StackGuardVar =
F.getParent()->getOrInsertGlobal("__stack_chk_guard", StackPtrTy);
@@ -390,7 +379,7 @@ void SafeStack::findInsts(Function &F,
if (!Arg.hasByValAttr())
continue;
uint64_t Size =
- DL->getTypeStoreSize(Arg.getType()->getPointerElementType());
+ DL.getTypeStoreSize(Arg.getType()->getPointerElementType());
if (IsSafeStackAlloca(&Arg, Size))
continue;
@@ -476,19 +465,19 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
if (StackGuardSlot) {
Type *Ty = StackGuardSlot->getAllocatedType();
unsigned Align =
- std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
+ std::max(DL.getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot),
Align, SSC.getFullLiveRange());
}
for (Argument *Arg : ByValArguments) {
Type *Ty = Arg->getType()->getPointerElementType();
- uint64_t Size = DL->getTypeStoreSize(Ty);
+ uint64_t Size = DL.getTypeStoreSize(Ty);
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.
// Ensure the object is properly aligned.
- unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty),
+ unsigned Align = std::max((unsigned)DL.getPrefTypeAlignment(Ty),
Arg->getParamAlignment());
SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange());
}
@@ -501,7 +490,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
// Ensure the object is properly aligned.
unsigned Align =
- std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment());
+ std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment());
SSL.addObject(AI, Size, Align, SSC.getLiveRange(AI));
}
@@ -539,7 +528,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
unsigned Offset = SSL.getObjectOffset(Arg);
Type *Ty = Arg->getType()->getPointerElementType();
- uint64_t Size = DL->getTypeStoreSize(Ty);
+ uint64_t Size = DL.getTypeStoreSize(Ty);
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.
@@ -630,7 +619,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
ArraySize = IRB.CreateIntCast(ArraySize, IntPtrTy, false);
Type *Ty = AI->getAllocatedType();
- uint64_t TySize = DL->getTypeAllocSize(Ty);
+ uint64_t TySize = DL.getTypeAllocSize(Ty);
Value *Size = IRB.CreateMul(ArraySize, ConstantInt::get(IntPtrTy, TySize));
Value *SP = IRB.CreatePtrToInt(IRB.CreateLoad(UnsafeStackPtr), IntPtrTy);
@@ -638,7 +627,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
// Align the SP value to satisfy the AllocaInst, type and stack alignments.
unsigned Align = std::max(
- std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()),
+ std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()),
(unsigned)StackAlignment);
assert(isPowerOf2_32(Align));
@@ -685,25 +674,10 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
}
}
-bool SafeStack::runOnFunction(Function &F) {
- DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
-
- if (!F.hasFnAttribute(Attribute::SafeStack)) {
- DEBUG(dbgs() << "[SafeStack] safestack is not requested"
- " for this function\n");
- return false;
- }
-
- if (F.isDeclaration()) {
- DEBUG(dbgs() << "[SafeStack] function definition"
- " is not available\n");
- return false;
- }
-
- if (!TM)
- report_fatal_error("Target machine is required");
- TL = TM->getSubtargetImpl(F)->getTargetLowering();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+bool SafeStack::run() {
+ assert(F.hasFnAttribute(Attribute::SafeStack) &&
+ "Can't run SafeStack on a function without the attribute");
+ assert(!F.isDeclaration() && "Can't run SafeStack on a function declaration");
++NumFunctions;
@@ -736,7 +710,7 @@ bool SafeStack::runOnFunction(Function &F) {
++NumUnsafeStackRestorePointsFunctions;
IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt());
- UnsafeStackPtr = TL->getSafeStackPointerLocation(IRB);
+ UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB);
// Load the current stack pointer (we'll also use it as a base pointer).
// FIXME: use a dedicated register for it ?
@@ -788,14 +762,70 @@ bool SafeStack::runOnFunction(Function &F) {
return true;
}
+class SafeStackLegacyPass : public FunctionPass {
+ const TargetMachine *TM;
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+ SafeStackLegacyPass(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {
+ initializeSafeStackLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ SafeStackLegacyPass() : SafeStackLegacyPass(nullptr) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
+
+ if (!F.hasFnAttribute(Attribute::SafeStack)) {
+ DEBUG(dbgs() << "[SafeStack] safestack is not requested"
+ " for this function\n");
+ return false;
+ }
+
+ if (F.isDeclaration()) {
+ DEBUG(dbgs() << "[SafeStack] function definition"
+ " is not available\n");
+ return false;
+ }
+
+ if (!TM)
+ report_fatal_error("Target machine is required");
+ auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+ if (!TL)
+ report_fatal_error("TargetLowering instance is required");
+
+ auto *DL = &F.getParent()->getDataLayout();
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &ACT = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+ // Compute DT and LI only for functions that have the attribute.
+ // This is only useful because the legacy pass manager doesn't let us
+    // compute analyses lazily.
+ // In the backend pipeline, nothing preserves DT before SafeStack, so we
+ // would otherwise always compute it wastefully, even if there is no
+ // function with the safestack attribute.
+ DominatorTree DT(F);
+ LoopInfo LI(DT);
+
+ ScalarEvolution SE(F, TLI, ACT, DT, LI);
+
+ return SafeStack(F, *TL, *DL, SE).run();
+ }
+};
+
} // anonymous namespace
-char SafeStack::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack",
+char SafeStackLegacyPass::ID = 0;
+INITIALIZE_TM_PASS_BEGIN(SafeStackLegacyPass, "safe-stack",
"Safe Stack instrumentation pass", false, false)
-INITIALIZE_TM_PASS_END(SafeStack, "safe-stack",
+INITIALIZE_TM_PASS_END(SafeStackLegacyPass, "safe-stack",
"Safe Stack instrumentation pass", false, false)
FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) {
- return new SafeStack(TM);
+ return new SafeStackLegacyPass(TM);
}
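
Now that the transformation itself lives in a plain utility class, a new-pass-manager wrapper becomes straightforward. The following is a hypothetical sketch only (SafeStackSketchPass is not part of this commit); it shows why lazy analyses would make the manual DominatorTree/LoopInfo construction above unnecessary:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

struct SafeStackSketchPass : PassInfoMixin<SafeStackSketchPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    if (!F.hasFnAttribute(Attribute::SafeStack))
      return PreservedAnalyses::all(); // Nothing computed for other functions.
    // The analysis manager computes ScalarEvolution (and its DT/LI inputs)
    // only on demand, unlike the legacy pipeline above.
    auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
    (void)SE; // ...hand off to the SafeStack utility class here.
    return PreservedAnalyses::none();
  }
};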
diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
new file mode 100644
index 000000000000..dab5b91f50ad
--- /dev/null
+++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -0,0 +1,660 @@
+//=== ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ===//
+//=== intrinsics ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces masked memory intrinsics - when unsupported by the target
+// - with a chain of basic blocks that deal with the elements one by one if the
+// appropriate mask bit is set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarize-masked-mem-intrin"
+
+namespace {
+
+class ScalarizeMaskedMemIntrin : public FunctionPass {
+ const TargetTransformInfo *TTI;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit ScalarizeMaskedMemIntrin() : FunctionPass(ID), TTI(nullptr) {
+ initializeScalarizeMaskedMemIntrinPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "Scalarize Masked Memory Intrinsics";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+private:
+ bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
+ bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
+};
+} // namespace
+
+char ScalarizeMaskedMemIntrin::ID = 0;
+INITIALIZE_PASS_BEGIN(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
+ "Scalarize unsupported masked memory intrinsics", false,
+ false)
+INITIALIZE_PASS_END(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
+ "Scalarize unsupported masked memory intrinsics", false,
+ false)
+
+FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() {
+ return new ScalarizeMaskedMemIntrin();
+}
+
+// Translate a masked load intrinsic like
+// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
+// <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks that loads the elements one by one if
+// the appropriate mask bit is set.
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.load, label %else
+//
+// cond.load: ; preds = %0
+// %4 = getelementptr i32* %1, i32 0
+// %5 = load i32* %4
+// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+// br label %else
+//
+// else: ; preds = %0, %cond.load
+// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+// %7 = extractelement <16 x i1> %mask, i32 1
+// %8 = icmp eq i1 %7, true
+// br i1 %8, label %cond.load1, label %else2
+//
+// cond.load1: ; preds = %else
+// %9 = getelementptr i32* %1, i32 1
+// %10 = load i32* %9
+// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+// br label %else2
+//
+// else2: ; preds = %else, %cond.load1
+// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+// %12 = extractelement <16 x i1> %mask, i32 2
+// %13 = icmp eq i1 %12, true
+// br i1 %13, label %cond.load4, label %else5
+//
+static void scalarizeMaskedLoad(CallInst *CI) {
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+ assert(VecType && "Unexpected return type of masked load intrinsic");
+
+ Type *EltTy = CI->getType()->getVectorElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ BasicBlock *CondBlock = nullptr;
+ BasicBlock *PrevIfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ bool IsAllOnesMask =
+ isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+ if (IsAllOnesMask) {
+ Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // Adjust alignment for the scalar instruction.
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
+  // Bitcast %addr from i8* to EltTy*.
+ Type *NewPtrType =
+ EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = VecType->getNumElements();
+
+ Value *UndefVal = UndefValue::get(VecType);
+
+ // The result vector
+ Value *VResult = UndefVal;
+
+ if (isa<ConstantVector>(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+ VResult =
+ Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+ }
+ Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ PHINode *Phi = nullptr;
+ Value *PrevPhi = UndefVal;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %to_load = icmp eq i1 %mask_1, true
+ // br i1 %to_load, label %cond.load, label %else
+ //
+ if (Idx > 0) {
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ PrevPhi = Phi;
+ VResult = Phi;
+ }
+
+ Value *Predicate =
+ Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1));
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+ VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+ }
+
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+}
+
+// Translate a masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+// <16 x i1> %mask)
+// to a chain of basic blocks that stores the elements one by one if
+// the appropriate mask bit is set.
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.store, label %else
+//
+// cond.store: ; preds = %0
+// %4 = extractelement <16 x i32> %val, i32 0
+// %5 = getelementptr i32* %1, i32 0
+// store i32 %4, i32* %5
+// br label %else
+//
+// else: ; preds = %0, %cond.store
+// %6 = extractelement <16 x i1> %mask, i32 1
+// %7 = icmp eq i1 %6, true
+// br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1: ; preds = %else
+// %8 = extractelement <16 x i32> %val, i32 1
+// %9 = getelementptr i32* %1, i32 1
+// store i32 %8, i32* %9
+// br label %else2
+// . . .
+static void scalarizeMaskedStore(CallInst *CI) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptr = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+ assert(VecType && "Unexpected data type in masked store intrinsic");
+
+ Type *EltTy = VecType->getElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ bool IsAllOnesMask =
+ isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+ if (IsAllOnesMask) {
+ Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+ CI->eraseFromParent();
+ return;
+ }
+
+  // Adjust alignment for the scalar instruction; mirror the load path above,
+  // which clamps with std::min (std::max would overstate the alignment of the
+  // per-element accesses).
+  AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
+  // Bitcast %addr from i8* to EltTy*.
+ Type *NewPtrType =
+ EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = VecType->getNumElements();
+
+ if (isa<ConstantVector>(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %to_store = icmp eq i1 %mask_1, true
+ // br i1 %to_store, label %cond.store, label %else
+ //
+ Value *Predicate =
+ Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1));
+
+ // Create "cond" block
+ //
+ // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %store i32 %OneElt, i32* %EltAddr
+ //
+ BasicBlock *CondBlock =
+ IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+}
+
+// Translate a masked gather intrinsic like
+// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
+// <16 x i1> %Mask, <16 x i32> %Src)
+// to a chain of basic blocks that loads the elements one by one if
+// the appropriate mask bit is set.
+//
+// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
+// % Mask0 = extractelement <16 x i1> %Mask, i32 0
+// % ToLoad0 = icmp eq i1 % Mask0, true
+// br i1 % ToLoad0, label %cond.load, label %else
+//
+// cond.load:
+// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// % Load0 = load i32, i32* % Ptr0, align 4
+// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
+// br label %else
+//
+// else:
+// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
+// % Mask1 = extractelement <16 x i1> %Mask, i32 1
+// % ToLoad1 = icmp eq i1 % Mask1, true
+// br i1 % ToLoad1, label %cond.load1, label %else2
+//
+// cond.load1:
+// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// % Load1 = load i32, i32* % Ptr1, align 4
+// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
+// br label %else2
+// . . .
+// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
+// ret <16 x i32> %Result
+static void scalarizeMaskedGather(CallInst *CI) {
+ Value *Ptrs = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+
+ assert(VecType && "Unexpected return type of masked load intrinsic");
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ BasicBlock *CondBlock = nullptr;
+ BasicBlock *PrevIfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ Value *UndefVal = UndefValue::get(VecType);
+
+ // The result vector
+ Value *VResult = UndefVal;
+ unsigned VectorWidth = VecType->getNumElements();
+
+  // Take a shortcut if the mask is a vector of constants.
+ bool IsConstMask = isa<ConstantVector>(Mask);
+
+ if (IsConstMask) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
+ VResult = Builder.CreateInsertElement(
+ VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx));
+ }
+ Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ PHINode *Phi = nullptr;
+ Value *PrevPhi = UndefVal;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %Mask1 = extractelement <16 x i1> %Mask, i32 1
+ // %ToLoad1 = icmp eq i1 %Mask1, true
+ // br i1 %ToLoad1, label %cond.load, label %else
+ //
+ if (Idx > 0) {
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ PrevPhi = Phi;
+ VResult = Phi;
+ }
+
+ Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
+ "Mask" + Twine(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1),
+ "ToLoad" + Twine(Idx));
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
+ VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
+ "Res" + Twine(Idx));
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+ }
+
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+}
+
+// Translate a masked scatter intrinsic, like
+// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4,
+// <16 x i1> %Mask)
+// to a chain of basic blocks that stores the elements one by one if
+// the appropriate mask bit is set.
+//
+// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
+// % Mask0 = extractelement <16 x i1> % Mask, i32 0
+// % ToStore0 = icmp eq i1 % Mask0, true
+// br i1 %ToStore0, label %cond.store, label %else
+//
+// cond.store:
+// % Elt0 = extractelement <16 x i32> %Src, i32 0
+// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// store i32 %Elt0, i32* % Ptr0, align 4
+// br label %else
+//
+// else:
+// % Mask1 = extractelement <16 x i1> % Mask, i32 1
+// % ToStore1 = icmp eq i1 % Mask1, true
+// br i1 % ToStore1, label %cond.store1, label %else2
+//
+// cond.store1:
+// % Elt1 = extractelement <16 x i32> %Src, i32 1
+// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// store i32 % Elt1, i32* % Ptr1, align 4
+// br label %else2
+// . . .
+static void scalarizeMaskedScatter(CallInst *CI) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptrs = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ assert(isa<VectorType>(Src->getType()) &&
+ "Unexpected data type in masked scatter intrinsic");
+ assert(isa<VectorType>(Ptrs->getType()) &&
+ isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
+ "Vector of pointers is expected in masked scatter intrinsic");
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ unsigned VectorWidth = Src->getType()->getVectorNumElements();
+
+  // Take a shortcut if the mask is a vector of constants.
+ bool IsConstMask = isa<ConstantVector>(Mask);
+
+ if (IsConstMask) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+ "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx
+ // % ToStore = icmp eq i1 % Mask1, true
+ // br i1 % ToStore, label %cond.store, label %else
+ //
+ Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
+ "Mask" + Twine(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1),
+ "ToStore" + Twine(Idx));
+
+ // Create "cond" block
+ //
+ // % Elt1 = extractelement <16 x i32> %Src, i32 1
+ // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+ // %store i32 % Elt1, i32* % Ptr1
+ //
+ BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+ "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+}
+
+bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ bool EverMadeChange = false;
+
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ bool MadeChange = true;
+ while (MadeChange) {
+ MadeChange = false;
+ for (Function::iterator I = F.begin(); I != F.end();) {
+ BasicBlock *BB = &*I++;
+ bool ModifiedDTOnIteration = false;
+ MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
+
+      // Restart BB iteration if the dominator tree of the function was changed.
+ if (ModifiedDTOnIteration)
+ break;
+ }
+
+ EverMadeChange |= MadeChange;
+ }
+
+ return EverMadeChange;
+}
+
+bool ScalarizeMaskedMemIntrin::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
+ bool MadeChange = false;
+
+ BasicBlock::iterator CurInstIterator = BB.begin();
+ while (CurInstIterator != BB.end()) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*CurInstIterator++))
+ MadeChange |= optimizeCallInst(CI, ModifiedDT);
+ if (ModifiedDT)
+ return true;
+ }
+
+ return MadeChange;
+}
+
+bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI,
+ bool &ModifiedDT) {
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ if (II) {
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::masked_load: {
+ // Scalarize unsupported vector masked load
+ if (!TTI->isLegalMaskedLoad(CI->getType())) {
+ scalarizeMaskedLoad(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_store: {
+ if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
+ scalarizeMaskedStore(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_gather: {
+ if (!TTI->isLegalMaskedGather(CI->getType())) {
+ scalarizeMaskedGather(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_scatter: {
+ if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
+ scalarizeMaskedScatter(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ }
+ }
+
+ return false;
+}
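
For completeness, hooking the new pass into a legacy pipeline is a one-liner; this sketch assumes only the createScalarizeMaskedMemIntrinPass() entry point added by this commit (the real wiring into TargetPassConfig happens elsewhere in the import):

#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/LegacyPassManager.h"

using namespace llvm;

// Run the scalarizer so targets without native masked load/store/gather/
// scatter support see plain scalar IR before instruction selection.
static void addScalarizeMaskedMemIntrin(legacy::PassManagerBase &PM) {
  PM.add(createScalarizeMaskedMemIntrinPass());
}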
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c77046fdfaf5..caf5cb497a71 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -114,7 +114,7 @@ namespace {
SmallPtrSet<SDNode *, 32> CombinedNodes;
// AA - Used for DAG load/store alias analysis.
- AliasAnalysis &AA;
+ AliasAnalysis *AA;
/// When an instruction is simplified, add all users of the instruction to
/// the work lists because they might get more simplified now.
@@ -496,9 +496,9 @@ namespace {
SDValue distributeTruncateThroughAnd(SDNode *N);
public:
- DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
+ DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
: DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
- OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
+ OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(AA) {
ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
MaximumLegalStoreInBits = 0;
@@ -1729,10 +1729,9 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
NumLeftToConsider--;
}
- SDValue Result;
-
// If we've changed things around then replace token factor.
if (Changed) {
+ SDValue Result;
if (Ops.empty()) {
// The entry token is the only possible outcome.
Result = DAG.getEntryNode();
@@ -1749,13 +1748,9 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
}
}
-
- // Add users to worklist, since we may introduce a lot of new
- // chained token factors while removing memory deps.
- return CombineTo(N, Result, true /*add to worklist*/);
+ return Result;
}
-
- return Result;
+ return SDValue();
}
/// MERGE_VALUES can always be eliminated.
@@ -2131,17 +2126,17 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CarryIn = N->getOperand(2);
+ SDLoc DL(N);
// canonicalize constant to RHS
ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
if (N0C && !N1C)
- return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
- N1, N0, CarryIn);
+ return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);
// fold (addcarry x, y, false) -> (uaddo x, y)
if (isNullConstant(CarryIn))
- return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), N0, N1);
+ return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
return Combined;
@@ -5313,17 +5308,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
}
}
- // If the target supports masking y in (shl, y),
- // fold (shl x, (and y, ((1 << numbits(x)) - 1))) -> (shl x, y)
- if (TLI.isOperationLegal(ISD::SHL, VT) &&
- TLI.supportsModuloShift(ISD::SHL, VT) && N1->getOpcode() == ISD::AND) {
- if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
- if (Mask->getZExtValue() == OpSizeInBits - 1) {
- return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1->getOperand(0));
- }
- }
- }
-
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (shl c1, c2) -> c1<<c2
@@ -5331,7 +5315,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C);
// fold (shl 0, x) -> 0
- if (isNullConstant(N0))
+ if (isNullConstantOrNullSplatConstant(N0))
return N0;
// fold (shl x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
@@ -5522,18 +5506,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
- // If the target supports masking y in (sra, y),
- // fold (sra x, (and y, ((1 << numbits(x)) - 1))) -> (sra x, y)
- if (TLI.isOperationLegal(ISD::SRA, VT) &&
- TLI.supportsModuloShift(ISD::SRA, VT) && N1->getOpcode() == ISD::AND) {
- if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
- if (Mask->getZExtValue() == OpSizeInBits - 1) {
- return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, N1->getOperand(0));
- }
- }
- }
-
// Arithmetic shifting an all-sign-bit value is a no-op.
+ // fold (sra 0, x) -> 0
+ // fold (sra -1, x) -> -1
if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
return N0;
@@ -5548,12 +5523,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
- // fold (sra 0, x) -> 0
- if (isNullConstant(N0))
- return N0;
- // fold (sra -1, x) -> -1
- if (isAllOnesConstant(N0))
- return N0;
// fold (sra x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
return DAG.getUNDEF(VT);
@@ -5691,17 +5660,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarSizeInBits();
- // If the target supports masking y in (srl, y),
- // fold (srl x, (and y, ((1 << numbits(x)) - 1))) -> (srl x, y)
- if (TLI.isOperationLegal(ISD::SRL, VT) &&
- TLI.supportsModuloShift(ISD::SRL, VT) && N1->getOpcode() == ISD::AND) {
- if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) {
- if (Mask->getZExtValue() == OpSizeInBits - 1) {
- return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1->getOperand(0));
- }
- }
- }
-
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
@@ -5714,7 +5672,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C);
// fold (srl 0, x) -> 0
- if (isNullConstant(N0))
+ if (isNullConstantOrNullSplatConstant(N0))
return N0;
// fold (srl x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
@@ -7365,14 +7323,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
N0.getValueSizeInBits(),
std::min(Op.getValueSizeInBits(),
VT.getSizeInBits()));
- if (TruncatedBits.isSubsetOf(Known.Zero)) {
- if (VT.bitsGT(Op.getValueType()))
- return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Op);
- if (VT.bitsLT(Op.getValueType()))
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
-
- return Op;
- }
+ if (TruncatedBits.isSubsetOf(Known.Zero))
+ return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
}
// fold (zext (truncate (load x))) -> (zext (smaller load x))
@@ -7419,14 +7371,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
}
if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
- SDValue Op = N0.getOperand(0);
- if (SrcVT.bitsLT(VT)) {
- Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
- AddToWorklist(Op.getNode());
- } else if (SrcVT.bitsGT(VT)) {
- Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
- AddToWorklist(Op.getNode());
- }
+ SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
+ AddToWorklist(Op.getNode());
return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
}
}
@@ -7440,11 +7386,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
N0.getValueType()) ||
!TLI.isZExtFree(N0.getValueType(), VT))) {
SDValue X = N0.getOperand(0).getOperand(0);
- if (X.getValueType().bitsLT(VT)) {
- X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(X), VT, X);
- } else if (X.getValueType().bitsGT(VT)) {
- X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
- }
+ X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.zext(VT.getSizeInBits());
SDLoc DL(N);
@@ -7669,14 +7611,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
}
// fold (aext (truncate x))
- if (N0.getOpcode() == ISD::TRUNCATE) {
- SDValue TruncOp = N0.getOperand(0);
- if (TruncOp.getValueType() == VT)
- return TruncOp; // x iff x size == zext size.
- if (TruncOp.getValueType().bitsGT(VT))
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, TruncOp);
- return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, TruncOp);
- }
+ if (N0.getOpcode() == ISD::TRUNCATE)
+ return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
// Fold (aext (and (trunc x), cst)) -> (and x, cst)
// if the trunc is not free.
@@ -7687,11 +7623,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
N0.getValueType())) {
SDLoc DL(N);
SDValue X = N0.getOperand(0).getOperand(0);
- if (X.getValueType().bitsLT(VT)) {
- X = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
- } else if (X.getValueType().bitsGT(VT)) {
- X = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
- }
+ X = DAG.getAnyExtOrTrunc(X, DL, VT);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.zext(VT.getSizeInBits());
return DAG.getNode(ISD::AND, DL, VT,
@@ -14868,6 +14800,55 @@ SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) {
return SDValue();
}
+// Combine shuffles of splat-shuffles of the form:
+// shuffle (shuffle V, undef, splat-mask), undef, M
+// If splat-mask contains undef elements, we need to be careful about
+// introducing undef's in the folded mask which are not the result of composing
+// the masks of the shuffles.
+static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask,
+ ShuffleVectorSDNode *Splat,
+ SelectionDAG &DAG) {
+ ArrayRef<int> SplatMask = Splat->getMask();
+ assert(UserMask.size() == SplatMask.size() && "Mask length mismatch");
+
+ // Prefer simplifying to the splat-shuffle, if possible. This is legal if
+ // every undef mask element in the splat-shuffle has a corresponding undef
+ // element in the user-shuffle's mask or if the composition of mask elements
+ // would result in undef.
+ // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
+ // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
+ // In this case it is not legal to simplify to the splat-shuffle because we
+// may be exposing the shuffle's users to an undef element at index 1
+ // which was not there before the combine.
+ // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
+ // In this case the composition of masks yields SplatMask, so it's ok to
+ // simplify to the splat-shuffle.
+ // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
+ // In this case the composed mask includes all undef elements of SplatMask
+ // and in addition sets element zero to undef. It is safe to simplify to
+ // the splat-shuffle.
+ auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
+ ArrayRef<int> SplatMask) {
+ for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
+ if (UserMask[i] != -1 && SplatMask[i] == -1 &&
+ SplatMask[UserMask[i]] != -1)
+ return false;
+ return true;
+ };
+ if (CanSimplifyToExistingSplat(UserMask, SplatMask))
+ return SDValue(Splat, 0);
+
+ // Create a new shuffle with a mask that is composed of the two shuffles'
+ // masks.
+ SmallVector<int, 32> NewMask;
+ for (int Idx : UserMask)
+ NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
+
+ return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
+ Splat->getOperand(0), Splat->getOperand(1),
+ NewMask);
+}
+
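A standalone sketch of the mask composition used by combineShuffleOfSplat, runnable outside LLVM (std::vector<int> stands in for ArrayRef<int>; -1 marks an undef lane, as in ShuffleVectorSDNode):

    #include <cassert>
    #include <iostream>
    #include <vector>

    // Compose a user shuffle mask with the splat mask it shuffles: an undef
    // user lane stays undef, a defined lane reads through the splat mask.
    static std::vector<int> composeMasks(const std::vector<int> &UserMask,
                                         const std::vector<int> &SplatMask) {
      assert(UserMask.size() == SplatMask.size() && "Mask length mismatch");
      std::vector<int> NewMask;
      for (int Idx : UserMask)
        NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
      return NewMask;
    }

    int main() {
      // First example from the comment above: UserMask=[0,2,u,u] and
      // SplatMask=[2,u,2,u] compose to [2,2,u,u]. Lane 1 is defined in the
      // composed mask but undef in the splat mask, so returning the existing
      // splat-shuffle would expose a new undef; a fresh shuffle is needed.
      for (int M : composeMasks({0, 2, -1, -1}, {2, -1, 2, -1}))
        std::cout << M << ' ';                  // prints: 2 2 -1 -1
      std::cout << '\n';
      return 0;
    }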
SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
@@ -14914,6 +14895,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
}
+ // A shuffle of a single vector that is a splat can always be folded.
+ if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))
+ if (N1->isUndef() && N0Shuf->isSplat())
+ return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG);
+
// If it is a splat, check if the argument vector is another splat or a
// build_vector.
if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
@@ -16381,17 +16367,17 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
UseAA = false;
#endif
- if (UseAA &&
+ if (UseAA && AA &&
Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) {
// Use alias analysis information.
int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset;
int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset;
AliasResult AAResult =
- AA.alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0,
- UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
- MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1,
- UseTBAA ? Op1->getAAInfo() : AAMDNodes()));
+ AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0,
+ UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
+ MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1,
+ UseTBAA ? Op1->getAAInfo() : AAMDNodes()) );
if (AAResult == NoAlias)
return false;
}
@@ -16605,7 +16591,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
}
/// This is the entry point for the file.
-void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA,
+void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
CodeGenOpt::Level OptLevel) {
/// This is the main entry point to this class.
DAGCombiner(*this, AA, OptLevel).Run(Level);
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 8c98e3740f6d..5003b79974eb 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -622,7 +622,7 @@ bool FastISel::selectStackmap(const CallInst *I) {
// have to worry about calling conventions and target-specific lowering code.
// Instead we perform the call lowering right here.
//
- // CALLSEQ_START(0...)
+ // CALLSEQ_START(0, 0...)
// STACKMAP(id, nbytes, ...)
// CALLSEQ_END(0, 0)
//
@@ -1150,16 +1150,16 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
return true;
}
- unsigned Offset = 0;
+ // Byval arguments with frame indices were already handled after argument
+ // lowering and before isel.
+ const auto *Arg =
+ dyn_cast<Argument>(Address->stripInBoundsConstantOffsets());
+ if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX)
+ return true;
+
Optional<MachineOperand> Op;
- if (const auto *Arg = dyn_cast<Argument>(Address))
- // Some arguments' frame index is recorded during argument lowering.
- Offset = FuncInfo.getArgumentFrameIndex(Arg);
- if (Offset)
- Op = MachineOperand::CreateFI(Offset);
- if (!Op)
- if (unsigned Reg = lookUpRegForValue(Address))
- Op = MachineOperand::CreateReg(Reg, false);
+ if (unsigned Reg = lookUpRegForValue(Address))
+ Op = MachineOperand::CreateReg(Reg, false);
// If we have a VLA that has a "use" in a metadata node that's then used
// here but it has no other uses, then we have a problem. E.g.,
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index cdf4d3a8b4e5..606b8952f3c1 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -85,7 +85,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
MF = &mf;
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
- MachineModuleInfo &MMI = MF->getMMI();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
unsigned StackAlign = TFI->getStackAlignment();
@@ -214,33 +213,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(&I)))
InitializeRegForValue(&I);
- // Collect llvm.dbg.declare information. This is done now instead of
- // during the initial isel pass through the IR so that it is done
- // in a predictable order.
- if (const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I)) {
- assert(DI->getVariable() && "Missing variable");
- assert(DI->getDebugLoc() && "Missing location");
- if (MMI.hasDebugInfo()) {
- // Don't handle byval struct arguments or VLAs, for example.
- // Non-byval arguments are handled here (they refer to the stack
- // temporary alloca at this point).
- const Value *Address = DI->getAddress();
- if (Address) {
- if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
- Address = BCI->getOperand(0);
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
- DenseMap<const AllocaInst *, int>::iterator SI =
- StaticAllocaMap.find(AI);
- if (SI != StaticAllocaMap.end()) { // Check for VLAs.
- int FI = SI->second;
- MF->setVariableDbgInfo(DI->getVariable(), DI->getExpression(),
- FI, DI->getDebugLoc());
- }
- }
- }
- }
- }
-
// Decide the preferred extend type for a value.
PreferredExtendType[&I] = getPreferredExtendForValue(&I);
}
@@ -510,12 +482,11 @@ void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A,
/// If the argument does not have any assigned frame index then INT_MAX is
/// returned.
int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) {
- DenseMap<const Argument *, int>::iterator I =
- ByValArgFrameIndexMap.find(A);
+ auto I = ByValArgFrameIndexMap.find(A);
if (I != ByValArgFrameIndexMap.end())
return I->second;
DEBUG(dbgs() << "Argument does not have assigned frame index!\n");
- return 0;
+ return INT_MAX;
}
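
The sentinel moves from 0 to INT_MAX because 0 is itself a valid frame index, so callers testing `if (int FI = getArgumentFrameIndex(Arg))` silently dropped arguments assigned slot 0. A hypothetical lookup in the fixed style (plain std::map standing in for the DenseMap):

    #include <cassert>
    #include <climits>
    #include <map>

    // Return the recorded frame index for A, or INT_MAX if none was assigned;
    // INT_MAX cannot collide with a real index, unlike the old 0 sentinel.
    static int getArgFrameIndex(const std::map<const void *, int> &M,
                                const void *A) {
      auto I = M.find(A);
      return I != M.end() ? I->second : INT_MAX;
    }

    int main() {
      int Arg = 0;
      std::map<const void *, int> M = {{&Arg, 0}}; // frame index 0 is valid
      assert(getArgFrameIndex(M, &Arg) == 0);      // found, not "missing"
      assert(getArgFrameIndex(M, nullptr) == INT_MAX);
      return 0;
    }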
unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg(
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2654b3ad7a62..9a47a914df91 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1493,7 +1493,7 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
SDValue Size = Tmp2.getOperand(1);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
@@ -4187,6 +4187,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
ReplacedNode(Node);
break;
}
+ case ISD::MUL:
case ISD::SDIV:
case ISD::SREM:
case ISD::UDIV:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index cde4331cc42d..4c3b514856b7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -675,6 +675,7 @@ private:
// Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
bool SplitVectorOperand(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_UnaryOp(SDNode *N);
SDValue SplitVecOp_TruncateHelper(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 97a7fab6efd0..ff0e609803d8 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1513,6 +1513,22 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = SplitVecOp_ExtVecInRegOp(N);
break;
+
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ Res = SplitVecOp_VECREDUCE(N, OpNo);
+ break;
}
}
@@ -1565,6 +1581,48 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
}
+SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+
+ SDValue VecOp = N->getOperand(OpNo);
+ EVT VecVT = VecOp.getValueType();
+ assert(VecVT.isVector() && "Can only split reduce vector operand");
+ GetSplitVector(VecOp, Lo, Hi);
+ EVT LoOpVT, HiOpVT;
+ std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT);
+
+ bool NoNaN = N->getFlags().hasNoNaNs();
+ unsigned CombineOpc = 0;
+ switch (N->getOpcode()) {
+ case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break;
+ case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break;
+ case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break;
+ case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break;
+ case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break;
+ case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break;
+ case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break;
+ case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break;
+ case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break;
+ case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break;
+ case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break;
+ case ISD::VECREDUCE_FMAX:
+ CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN;
+ break;
+ case ISD::VECREDUCE_FMIN:
+ CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN;
+ break;
+ default:
+ llvm_unreachable("Unexpected reduce ISD node");
+ }
+
+ // Use the appropriate scalar instruction on the split subvectors before
+ // reducing the now partially reduced smaller vector.
+ SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi);
+ return DAG.getNode(N->getOpcode(), dl, ResVT, Partial);
+}
+
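The shape of the SplitVecOp_VECREDUCE rewrite, modeled on scalars (plain C++, not the legalizer API): combine the two halves element-wise with the matching binop, then reduce the half-width result. For an associative, commutative op such as ADD the answer is unchanged.

    #include <array>
    #include <iostream>

    int main() {
      // VECREDUCE_ADD over <4 x i32>: GetSplitVector yields Lo and Hi halves.
      std::array<int, 4> V = {1, 2, 3, 4};
      std::array<int, 2> Lo = {V[0], V[1]}, Hi = {V[2], V[3]};
      // Combine the halves element-wise with the matching binop (ADD here)...
      std::array<int, 2> Partial = {Lo[0] + Hi[0], Lo[1] + Hi[1]};
      // ...then emit the same reduction on the half-width vector.
      int Res = Partial[0] + Partial[1];
      std::cout << Res << '\n'; // prints: 10 == 1+2+3+4
      return 0;
    }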
SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d605a1dc1c20..057badcd6b74 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2217,10 +2217,10 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// Also compute a conservative estimate for high known-0 bits.
// More trickiness is possible, but this is sufficient for the
// interesting case of alignment computation.
- unsigned TrailZ = Known.Zero.countTrailingOnes() +
- Known2.Zero.countTrailingOnes();
- unsigned LeadZ = std::max(Known.Zero.countLeadingOnes() +
- Known2.Zero.countLeadingOnes(),
+ unsigned TrailZ = Known.countMinTrailingZeros() +
+ Known2.countMinTrailingZeros();
+ unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
+ Known2.countMinLeadingZeros(),
BitWidth) - BitWidth;
Known.resetAll();
@@ -2233,13 +2233,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
- unsigned LeadZ = Known2.Zero.countLeadingOnes();
+ unsigned LeadZ = Known2.countMinLeadingZeros();
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
- unsigned RHSUnknownLeadingOnes = Known2.One.countLeadingZeros();
- if (RHSUnknownLeadingOnes != BitWidth)
- LeadZ = std::min(BitWidth,
- LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+ unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
+ if (RHSMaxLeadingZeros != BitWidth)
+ LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
Known.Zero.setHighBits(LeadZ);
break;
@@ -2359,7 +2358,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
case ISD::CTTZ_ZERO_UNDEF: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we have a known 1, its position is our upper bound.
- unsigned PossibleTZ = Known2.One.countTrailingZeros();
+ unsigned PossibleTZ = Known2.countMaxTrailingZeros();
unsigned LowBits = Log2_32(PossibleTZ) + 1;
Known.Zero.setBitsFrom(LowBits);
break;
@@ -2368,7 +2367,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
case ISD::CTLZ_ZERO_UNDEF: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we have a known 1, its position is our upper bound.
- unsigned PossibleLZ = Known2.One.countLeadingZeros();
+ unsigned PossibleLZ = Known2.countMaxLeadingZeros();
unsigned LowBits = Log2_32(PossibleLZ) + 1;
Known.Zero.setBitsFrom(LowBits);
break;
@@ -2376,7 +2375,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
case ISD::CTPOP: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we know some of the bits are zero, they can't be one.
- unsigned PossibleOnes = BitWidth - Known2.Zero.countPopulation();
+ unsigned PossibleOnes = Known2.countMaxPopulation();
Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1);
break;
}
@@ -2493,13 +2492,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// going to be 0 in the result. Both addition and complement operations
// preserve the low zero bits.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
- unsigned KnownZeroLow = Known2.Zero.countTrailingOnes();
+ unsigned KnownZeroLow = Known2.countMinTrailingZeros();
if (KnownZeroLow == 0)
break;
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
- KnownZeroLow = std::min(KnownZeroLow,
- Known2.Zero.countTrailingOnes());
+ KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
Known.Zero.setLowBits(KnownZeroLow);
break;
}
@@ -2526,15 +2524,13 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// and the other has the top 8 bits clear, we know the top 7 bits of the
// output must be clear.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
- unsigned KnownZeroHigh = Known2.Zero.countLeadingOnes();
- unsigned KnownZeroLow = Known2.Zero.countTrailingOnes();
+ unsigned KnownZeroHigh = Known2.countMinLeadingZeros();
+ unsigned KnownZeroLow = Known2.countMinTrailingZeros();
computeKnownBits(Op.getOperand(1), Known2, DemandedElts,
Depth + 1);
- KnownZeroHigh = std::min(KnownZeroHigh,
- Known2.Zero.countLeadingOnes());
- KnownZeroLow = std::min(KnownZeroLow,
- Known2.Zero.countTrailingOnes());
+ KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros());
+ KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
if (Opcode == ISD::ADDE || Opcode == ISD::ADDCARRY) {
// With ADDE and ADDCARRY, a carry bit may be added in, so we can only
@@ -2594,8 +2590,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
- uint32_t Leaders = std::max(Known.Zero.countLeadingOnes(),
- Known2.Zero.countLeadingOnes());
+ uint32_t Leaders =
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
Known.resetAll();
Known.Zero.setHighBits(Leaders);
break;
@@ -2711,8 +2707,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// UMIN - we know that the result will have the maximum of the
// known zero leading bits of the inputs.
- unsigned LeadZero = Known.Zero.countLeadingOnes();
- LeadZero = std::max(LeadZero, Known2.Zero.countLeadingOnes());
+ unsigned LeadZero = Known.countMinLeadingZeros();
+ LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros());
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
@@ -2726,8 +2722,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
// UMAX - we know that the result will have the maximum of the
// known one leading bits of the inputs.
- unsigned LeadOne = Known.One.countLeadingOnes();
- LeadOne = std::max(LeadOne, Known2.One.countLeadingOnes());
+ unsigned LeadOne = Known.countMinLeadingOnes();
+ LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes());
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
@@ -2843,8 +2839,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
// Fall back to computeKnownBits to catch other known cases.
KnownBits Known;
computeKnownBits(Val, Known);
- return (Known.Zero.countPopulation() == BitWidth - 1) &&
- (Known.One.countPopulation() == 1);
+ return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1);
}
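
The mask-counting idiom this file migrates away from is easiest to see on a flat 64-bit model (illustrative; the real KnownBits wraps APInt and is width-agnostic): counting trailing ones of the Zero mask is counting low bits guaranteed to be zero, hence the countMin/countMax names.

    #include <cassert>
    #include <cstdint>

    struct KnownBits64 {
      uint64_t Zero = 0, One = 0; // bit set => that bit is known 0 / known 1

      // Old spelling: Zero.countTrailingOnes() -- low bits guaranteed zero.
      unsigned countMinTrailingZeros() const {
        uint64_t NotZero = ~Zero;
        return NotZero ? __builtin_ctzll(NotZero) : 64;
      }
      // At most this many bits can be set: those not known to be zero.
      unsigned countMaxPopulation() const {
        return 64 - __builtin_popcountll(Zero);
      }
      // At least this many bits are set: those known to be one.
      unsigned countMinPopulation() const {
        return __builtin_popcountll(One);
      }
    };

    int main() {
      KnownBits64 K;
      K.Zero = ~0x8ull; // every bit except bit 3 is known zero
      K.One = 0x8ull;   // bit 3 is known one
      // The power-of-two test from isKnownToBeAPowerOfTwo above.
      assert(K.countMaxPopulation() == 1 && K.countMinPopulation() == 1);
      assert(K.countMinTrailingZeros() == 3);
      return 0;
    }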
unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
@@ -2860,6 +2855,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
EVT VT = Op.getValueType();
assert(VT.isInteger() && "Invalid VT!");
unsigned VTBits = VT.getScalarSizeInBits();
+ unsigned NumElts = DemandedElts.getBitWidth();
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
@@ -2903,6 +2899,39 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
}
return Tmp;
+ case ISD::VECTOR_SHUFFLE: {
+ // Collect the minimum number of sign bits that are shared by every vector
+ // element referenced by the shuffle.
+ APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+ assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = SVN->getMaskElt(i);
+ if (!DemandedElts[i])
+ continue;
+ // For UNDEF elements, we don't know anything about the common state of
+ // the shuffle result.
+ if (M < 0)
+ return 1;
+ if ((unsigned)M < NumElts)
+ DemandedLHS.setBit((unsigned)M % NumElts);
+ else
+ DemandedRHS.setBit((unsigned)M % NumElts);
+ }
+ Tmp = UINT_MAX;
+ if (!!DemandedLHS)
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
+ if (!!DemandedRHS) {
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ // If we don't know anything, early out and try computeKnownBits fall-back.
+ if (Tmp == 1)
+ break;
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+ return Tmp;
+ }
+
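How the new VECTOR_SHUFFLE case routes demanded lanes to the two source operands, as a standalone sketch (plain C++; -1 marks an undef lane, mask entries >= NumElts select from the right-hand operand). The real code returns 1 immediately if a demanded lane is undef; the sketch only exercises the defined-lane routing:

    #include <bitset>
    #include <iostream>
    #include <vector>

    int main() {
      const unsigned NumElts = 4;
      std::vector<int> Mask = {6, 1, -1, 2}; // lane 2 is undef
      std::bitset<4> Demanded("1011");       // lanes 0, 1 and 3 demanded
      std::bitset<4> DemandedLHS, DemandedRHS;
      for (unsigned i = 0; i != NumElts; ++i) {
        if (!Demanded[i] || Mask[i] < 0)
          continue; // a *demanded* undef lane would force the answer 1
        unsigned M = static_cast<unsigned>(Mask[i]);
        (M < NumElts ? DemandedLHS : DemandedRHS).set(M % NumElts);
      }
      std::cout << DemandedLHS << ' ' << DemandedRHS << '\n'; // 0110 0100
      return 0;
    }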
case ISD::SIGN_EXTEND:
case ISD::SIGN_EXTEND_VECTOR_INREG:
Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
@@ -3142,14 +3171,36 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
}
- case ISD::EXTRACT_SUBVECTOR:
- return ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ case ISD::EXTRACT_SUBVECTOR: {
+ // If we know the element index, just demand those subvector elements,
+ // otherwise demand them all.
+ SDValue Src = Op.getOperand(0);
+ ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
+ // Offset the demanded elts by the subvector index.
+ uint64_t Idx = SubIdx->getZExtValue();
+ APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
+ return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
+ }
+ return ComputeNumSignBits(Src, Depth + 1);
+ }
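The demanded-element offsetting in the new EXTRACT_SUBVECTOR case on plain integer masks (illustrative; APInt performs the zero-extension in the real code):

    #include <cassert>
    #include <cstdint>

    int main() {
      // extract_subvector <2 x ty> from an <8 x ty> source at index 4:
      // demanded result lanes {0,1} map to source lanes {4,5}.
      uint64_t DemandedElts = 0b11;               // both result lanes demanded
      unsigned Idx = 4;
      uint64_t DemandedSrc = DemandedElts << Idx; // the zext(...).shl(Idx)
      assert(DemandedSrc == 0b110000);
      return 0;
    }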
case ISD::CONCAT_VECTORS:
- // Determine the minimum number of sign bits across all input vectors.
- // Early out if the result is already 1.
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
- for (unsigned i = 1, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i)
- Tmp = std::min(Tmp, ComputeNumSignBits(Op.getOperand(i), Depth + 1));
+ // Determine the minimum number of sign bits across all demanded
+ // elts of the input vectors. Early out if the result is already 1.
+ Tmp = UINT_MAX;
+ EVT SubVectorVT = Op.getOperand(0).getValueType();
+ unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
+ unsigned NumSubVectors = Op.getNumOperands();
+ for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) {
+ APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
+ DemandedSub = DemandedSub.trunc(NumSubVectorElts);
+ if (!DemandedSub)
+ continue;
+ Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
@@ -3543,7 +3594,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid sext node, dst < src!");
if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// sext(undef) = 0, because the top bits will all be the same.
return getConstant(0, DL, VT);
@@ -3559,8 +3610,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid zext node, dst < src!");
if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x)
- return getNode(ISD::ZERO_EXTEND, DL, VT,
- Operand.getNode()->getOperand(0));
+ return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// zext(undef) = 0, because the top bits will be zero.
return getConstant(0, DL, VT);
@@ -3579,13 +3629,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND)
// (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x)
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
// (ext (trunc x)) -> x
if (OpOpcode == ISD::TRUNCATE) {
- SDValue OpOp = Operand.getNode()->getOperand(0);
+ SDValue OpOp = Operand.getOperand(0);
if (OpOp.getValueType() == VT)
return OpOp;
}
@@ -3601,16 +3651,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsGT(VT) &&
"Invalid truncate node, src < dst!");
if (OpOpcode == ISD::TRUNCATE)
- return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND) {
// If the source is smaller than the dest, we still need an extend.
- if (Operand.getNode()->getOperand(0).getValueType().getScalarType()
+ if (Operand.getOperand(0).getValueType().getScalarType()
.bitsLT(VT.getScalarType()))
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
- if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT))
- return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
- return Operand.getNode()->getOperand(0);
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
+ if (Operand.getOperand(0).getValueType().bitsGT(VT))
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
+ return Operand.getOperand(0);
}
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
@@ -3665,15 +3715,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB)
// FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags?
- return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1),
- Operand.getNode()->getOperand(0),
- Operand.getNode()->getFlags());
+ return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
+ Operand.getOperand(0), Operand.getNode()->getFlags());
if (OpOpcode == ISD::FNEG) // --X -> X
- return Operand.getNode()->getOperand(0);
+ return Operand.getOperand(0);
break;
case ISD::FABS:
if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
- return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(ISD::FABS, DL, VT, Operand.getOperand(0));
break;
}
@@ -5970,7 +6019,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
unsigned NumOps = Ops.size();
switch (NumOps) {
case 0: return getNode(Opcode, DL, VT);
- case 1: return getNode(Opcode, DL, VT, Ops[0]);
+ case 1: return getNode(Opcode, DL, VT, Ops[0], Flags);
case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
default: break;
@@ -7520,9 +7569,8 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
KnownBits Known(PtrWidth);
- llvm::computeKnownBits(const_cast<GlobalValue *>(GV), Known,
- getDataLayout());
- unsigned AlignBits = Known.Zero.countTrailingOnes();
+ llvm::computeKnownBits(GV, Known, getDataLayout());
+ unsigned AlignBits = Known.countMinTrailingZeros();
unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
if (Align)
return MinAlign(Align, GVOffset);
@@ -7621,7 +7669,7 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
return false;
// FIXME: The widths are based on this node's type, but build vectors can
- // truncate their operands.
+ // truncate their operands.
SplatValue = APInt(VecWidth, 0);
SplatUndef = APInt(VecWidth, 0);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 50313e2da884..57d340c41c39 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -661,7 +661,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
unsigned RegSize = RegisterVT.getSizeInBits();
unsigned NumSignBits = LOI->NumSignBits;
- unsigned NumZeroBits = LOI->Known.Zero.countLeadingOnes();
+ unsigned NumZeroBits = LOI->Known.countMinLeadingZeros();
if (NumZeroBits == RegSize) {
// The current value is a zero.
@@ -811,9 +811,9 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
}
}
-void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa,
const TargetLibraryInfo *li) {
- AA = &aa;
+ AA = aa;
GFI = gfi;
LibInfo = li;
DL = &DAG.getDataLayout();
@@ -3423,7 +3423,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (isVolatile || NumValues > MaxParallelChains)
// Serialize volatile loads with other side effects.
Root = getRoot();
- else if (AA->pointsToConstantMemory(MemoryLocation(
+ else if (AA && AA->pointsToConstantMemory(MemoryLocation(
SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) {
// Do not serialize (non-volatile) loads of constant memory with anything.
Root = DAG.getEntryNode();
@@ -3535,8 +3535,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
Type *Ty = I.getType();
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
- assert(!AA->pointsToConstantMemory(MemoryLocation(
- SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo)) &&
+ assert((!AA || !AA->pointsToConstantMemory(MemoryLocation(
+ SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) &&
"load_from_swift_error should not be constant memory");
SmallVector<EVT, 4> ValueVTs;
@@ -3817,7 +3817,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
// Do not serialize masked loads of constant memory with anything.
- bool AddToChain = !AA->pointsToConstantMemory(MemoryLocation(
+ bool AddToChain = !AA || !AA->pointsToConstantMemory(MemoryLocation(
PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo));
SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
@@ -3861,7 +3861,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
bool UniformBase = getUniformBase(BasePtr, Base, Index, this);
bool ConstantMemory = false;
if (UniformBase &&
- AA->pointsToConstantMemory(MemoryLocation(
+ AA && AA->pointsToConstantMemory(MemoryLocation(
BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()),
AAInfo))) {
// Do not serialize (non-volatile) loads of constant memory with anything.
@@ -4676,7 +4676,8 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
bool IsIndirect = false;
Optional<MachineOperand> Op;
// Some arguments' frame index is recorded during argument lowering.
- if (int FI = FuncInfo.getArgumentFrameIndex(Arg))
+ int FI = FuncInfo.getArgumentFrameIndex(Arg);
+ if (FI != INT_MAX)
Op = MachineOperand::CreateFI(FI);
if (!Op && N.getNode()) {
@@ -4927,6 +4928,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return nullptr;
}
+ // Byval arguments with frame indices were already handled after argument
+ // lowering and before isel.
+ const auto *Arg =
+ dyn_cast<Argument>(Address->stripInBoundsConstantOffsets());
+ if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX)
+ return nullptr;
+
SDValue &N = NodeMap[Address];
if (!N.getNode() && isa<Argument>(Address))
// Check unused arguments map.
@@ -4957,20 +4965,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
// virtual register info from the FuncInfo.ValueMap.
if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true,
N)) {
- // If variable is pinned by a alloca in dominating bb then
- // use StaticAllocaMap.
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
- if (AI->getParent() != DI.getParent()) {
- DenseMap<const AllocaInst*, int>::iterator SI =
- FuncInfo.StaticAllocaMap.find(AI);
- if (SI != FuncInfo.StaticAllocaMap.end()) {
- SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second,
- 0, dl, SDNodeOrder);
- DAG.AddDbgValue(SDV, nullptr, false);
- return nullptr;
- }
- }
- }
DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
}
}
@@ -5651,7 +5645,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
int FI = FuncInfo.StaticAllocaMap[Slot];
MCSymbol *FrameAllocSym =
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
- GlobalValue::getRealLinkageName(MF.getName()), Idx);
+ GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
TII->get(TargetOpcode::LOCAL_ESCAPE))
.addSym(FrameAllocSym)
@@ -5672,7 +5666,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
unsigned IdxVal = unsigned(Idx->getLimitedValue(INT_MAX));
MCSymbol *FrameAllocSym =
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
- GlobalValue::getRealLinkageName(Fn->getName()), IdxVal);
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal);
// Create a MCSymbol for the label to avoid any target lowering
// that would make this PC relative.
@@ -5737,6 +5731,24 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::experimental_deoptimize:
LowerDeoptimizeCall(&I);
return nullptr;
+
+ case Intrinsic::experimental_vector_reduce_fadd:
+ case Intrinsic::experimental_vector_reduce_fmul:
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin: {
+ visitVectorReduce(I, Intrinsic);
+ return nullptr;
+ }
+
}
}
@@ -5982,7 +5994,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
bool ConstantMemory = false;
// Do not serialize (non-volatile) loads of constant memory with anything.
- if (Builder.AA->pointsToConstantMemory(PtrVal)) {
+ if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) {
Root = Builder.DAG.getEntryNode();
ConstantMemory = true;
} else {
@@ -7422,11 +7434,11 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
// have to worry about calling conventions and target specific lowering code.
// Instead we perform the call lowering right here.
//
- // chain, flag = CALLSEQ_START(chain, 0)
+ // chain, flag = CALLSEQ_START(chain, 0, 0)
// chain, flag = STACKMAP(id, nbytes, ..., chain, flag)
// chain, flag = CALLSEQ_END(chain, 0, 0, flag)
//
- Chain = DAG.getCALLSEQ_START(getRoot(), NullPtr, DL);
+ Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL);
InFlag = Chain.getValue(1);
// Add the <id> and <numBytes> constants.
@@ -7616,6 +7628,76 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
FuncInfo.MF->getFrameInfo().setHasPatchPoint();
}
+void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
+ unsigned Intrinsic) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2;
+ if (I.getNumArgOperands() > 1)
+ Op2 = getValue(I.getArgOperand(1));
+ SDLoc dl = getCurSDLoc();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ SDValue Res;
+ FastMathFlags FMF;
+ if (isa<FPMathOperator>(I))
+ FMF = I.getFastMathFlags();
+ SDNodeFlags SDFlags;
+ SDFlags.setNoNaNs(FMF.noNaNs());
+
+ switch (Intrinsic) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ if (FMF.unsafeAlgebra())
+ Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
+ break;
+ case Intrinsic::experimental_vector_reduce_fmul:
+ if (FMF.unsafeAlgebra())
+ Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
+ break;
+ case Intrinsic::experimental_vector_reduce_add:
+ Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_mul:
+ Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_and:
+ Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_or:
+ Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_xor:
+ Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_smax:
+ Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_smin:
+ Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_umax:
+ Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_umin:
+ Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_fmax: {
+ Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
+ break;
+ }
+ case Intrinsic::experimental_vector_reduce_fmin: {
+ Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
+ break;
+ }
+ default:
+ llvm_unreachable("Unhandled vector reduce intrinsic");
+ }
+ setValue(&I, Res);
+}
+
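Why visitVectorReduce keeps two fadd forms: VECREDUCE_FADD may reassociate and so is only emitted under unsafeAlgebra, while the strict form threads the scalar accumulator Op1 through in source order. A scalar illustration of the difference (plain C++, hypothetical helper name):

    #include <iostream>
    #include <vector>

    // Strict reduction: fixed evaluation order ((Acc + v0) + v1) + ...
    static float reduceStrict(float Acc, const std::vector<float> &V) {
      for (float X : V)
        Acc += X;
      return Acc;
    }

    int main() {
      std::vector<float> V = {1e8f, 1.0f, -1e8f};
      // A reassociated (fast) reduction may pair 1e8 with -1e8 first and
      // return 1; the strict order rounds the 1.0 away against 1e8.
      std::cout << reduceStrict(0.0f, V) << '\n'; // prints: 0
      return 0;
    }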
/// Returns an AttributeList representing the attributes applied to the return
/// value of the given call.
static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 9e9989058ae5..bdaee858da61 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -604,11 +604,11 @@ public:
SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo,
CodeGenOpt::Level ol)
: CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()),
- DAG(dag), FuncInfo(funcinfo),
+ DAG(dag), DL(nullptr), AA(nullptr), FuncInfo(funcinfo),
HasTailCall(false) {
}
- void init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+ void init(GCFunctionInfo *gfi, AliasAnalysis *AA,
const TargetLibraryInfo *li);
/// Clear out the current SelectionDAG and the associated state and prepare
@@ -909,6 +909,8 @@ private:
void visitGCRelocate(const GCRelocateInst &I);
void visitGCResult(const GCResultInst &I);
+ void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
+
void visitUserOp1(const Instruction &I) {
llvm_unreachable("UserOp1 should not exist at instruction selection time!");
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 26dd45ef933f..c37d7080f2c5 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -346,6 +346,19 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::SETFALSE: return "setfalse";
case ISD::SETFALSE2: return "setfalse2";
}
+ case ISD::VECREDUCE_FADD: return "vecreduce_fadd";
+ case ISD::VECREDUCE_FMUL: return "vecreduce_fmul";
+ case ISD::VECREDUCE_ADD: return "vecreduce_add";
+ case ISD::VECREDUCE_MUL: return "vecreduce_mul";
+ case ISD::VECREDUCE_AND: return "vecreduce_and";
+ case ISD::VECREDUCE_OR: return "vecreduce_or";
+ case ISD::VECREDUCE_XOR: return "vecreduce_xor";
+ case ISD::VECREDUCE_SMAX: return "vecreduce_smax";
+ case ISD::VECREDUCE_SMIN: return "vecreduce_smin";
+ case ISD::VECREDUCE_UMAX: return "vecreduce_umax";
+ case ISD::VECREDUCE_UMIN: return "vecreduce_umin";
+ case ISD::VECREDUCE_FMAX: return "vecreduce_fmax";
+ case ISD::VECREDUCE_FMIN: return "vecreduce_fmin";
}
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 3aabdaeaa094..5e0feccb6b4c 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -38,6 +38,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePassRegistry.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -299,7 +300,7 @@ SelectionDAGISel::SelectionDAGISel(TargetMachine &tm,
FuncInfo(new FunctionLoweringInfo()),
CurDAG(new SelectionDAG(tm, OL)),
SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)),
- GFI(),
+ AA(), GFI(),
OptLevel(OL),
DAGSize(0) {
initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
@@ -317,7 +318,8 @@ SelectionDAGISel::~SelectionDAGISel() {
}
void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AAResultsWrapperPass>();
+ if (OptLevel != CodeGenOpt::None)
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<GCModuleInfo>();
AU.addRequired<StackProtector>();
AU.addPreserved<StackProtector>();
@@ -394,7 +396,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
TII = MF->getSubtarget().getInstrInfo();
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
ORE = make_unique<OptimizationRemarkEmitter>(&Fn);
@@ -406,12 +407,22 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
CurDAG->init(*MF, *ORE);
FuncInfo->set(Fn, *MF, CurDAG);
+ // Now get the optional analyses if we want to.
+ // This is based on the possibly changed OptLevel (after optnone is taken
+ // into account). That's unfortunate but OK because it just means we won't
+ // ask for passes that have been required anyway.
+
if (UseMBPI && OptLevel != CodeGenOpt::None)
FuncInfo->BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
else
FuncInfo->BPI = nullptr;
- SDB->init(GFI, *AA, LibInfo);
+ if (OptLevel != CodeGenOpt::None)
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ else
+ AA = nullptr;
+
+ SDB->init(GFI, AA, LibInfo);
MF->setHasInlineAsm(false);
@@ -715,7 +726,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine1", "DAG Combining 1", GroupName,
GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(BeforeLegalizeTypes, *AA, OptLevel);
+ CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized lowered selection DAG: BB#" << BlockNumber
@@ -747,7 +758,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine_lt", "DAG Combining after legalize types",
GroupName, GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeTypes, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized type-legalized selection DAG: BB#" << BlockNumber
@@ -781,7 +792,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine_lv", "DAG Combining after legalize vectors",
GroupName, GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeVectorOps, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized vector-legalized selection DAG: BB#"
@@ -807,7 +818,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine2", "DAG Combining 2", GroupName,
GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeDAG, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized legalized selection DAG: BB#" << BlockNumber
@@ -1145,6 +1156,51 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo,
}
}
+/// Collect llvm.dbg.declare information. This is done after argument lowering
+/// in case the declarations refer to arguments.
+static void processDbgDeclares(FunctionLoweringInfo *FuncInfo) {
+ MachineFunction *MF = FuncInfo->MF;
+ const DataLayout &DL = MF->getDataLayout();
+ for (const BasicBlock &BB : *FuncInfo->Fn) {
+ for (const Instruction &I : BB) {
+ const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I);
+ if (!DI)
+ continue;
+
+ assert(DI->getVariable() && "Missing variable");
+ assert(DI->getDebugLoc() && "Missing location");
+ const Value *Address = DI->getAddress();
+ if (!Address)
+ continue;
+
+ // Look through casts and constant offset GEPs. These mostly come from
+ // inalloca.
+ APInt Offset(DL.getPointerSizeInBits(0), 0);
+ Address = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+
+ // Check if the variable is a static alloca or a byval or inalloca
+ // argument passed in memory. If it is not, then we will ignore this
+ // intrinsic and handle this during isel like dbg.value.
+ int FI = INT_MAX;
+ if (const auto *AI = dyn_cast<AllocaInst>(Address)) {
+ auto SI = FuncInfo->StaticAllocaMap.find(AI);
+ if (SI != FuncInfo->StaticAllocaMap.end())
+ FI = SI->second;
+ } else if (const auto *Arg = dyn_cast<Argument>(Address))
+ FI = FuncInfo->getArgumentFrameIndex(Arg);
+
+ if (FI == INT_MAX)
+ continue;
+
+ DIExpression *Expr = DI->getExpression();
+ if (Offset.getBoolValue())
+ Expr = DIExpression::prepend(Expr, DIExpression::NoDeref,
+ Offset.getZExtValue());
+ MF->setVariableDbgInfo(DI->getVariable(), Expr, FI, DI->getDebugLoc());
+ }
+ }
+}
+
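processDbgDeclares relies on stripAndAccumulateInBoundsConstantOffsets to look through inalloca-style wrappers while summing their byte offsets. A plain-C++ model of that walk (the Node type and all names here are hypothetical, not the LLVM API):

    #include <cassert>
    #include <cstdint>

    struct Node {
      const Node *Base = nullptr; // non-null => constant-offset wrapper
      int64_t Offset = 0;         // fixed byte offset applied by this node
    };

    // Walk to the underlying storage, accumulating every constant offset.
    static const Node *stripAndAccumulate(const Node *N, int64_t &Offset) {
      while (N->Base) {
        Offset += N->Offset;
        N = N->Base;
      }
      return N;
    }

    int main() {
      Node Alloca;                 // the underlying static alloca
      Node Gep1{&Alloca, 8};       // gep with constant offset 8
      Node Gep2{&Gep1, 4};         // nested gep with constant offset 4
      int64_t Off = 0;
      assert(stripAndAccumulate(&Gep2, Off) == &Alloca && Off == 12);
      return 0;
    }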
/// Propagate swifterror values through the machine function CFG.
static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) {
auto *TLI = FuncInfo->TLI;
@@ -1317,6 +1373,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
}
createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB);
+ processDbgDeclares(FuncInfo);
+
// Iterate over all basic blocks in the function.
for (const BasicBlock *LLVMBB : RPOT) {
if (OptLevel != CodeGenOpt::None) {
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 23f597db140c..befbd80d7965 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -417,11 +417,10 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
if (TLI.isTruncateFree(Op.getValueType(), SmallVT) &&
TLI.isZExtFree(SmallVT, Op.getValueType())) {
// We found a type with free casts.
- SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT,
- DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
- Op.getNode()->getOperand(0)),
- DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
- Op.getNode()->getOperand(1)));
+ SDValue X = DAG.getNode(
+ Op.getOpcode(), dl, SmallVT,
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)),
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(1)));
bool NeedZext = DemandedSize > SmallVTBits;
SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND,
dl, Op.getValueType(), X);
@@ -817,7 +816,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits
// are not demanded. This will likely allow the anyext to be folded away.
if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) {
- SDValue InnerOp = InOp.getNode()->getOperand(0);
+ SDValue InnerOp = InOp.getOperand(0);
EVT InnerVT = InnerOp.getValueType();
unsigned InnerBits = InnerVT.getSizeInBits();
if (ShAmt < InnerBits && NewMask.getActiveBits() <= InnerBits &&
diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp
index 4837495777da..2638702da152 100644
--- a/lib/CodeGen/ShrinkWrap.cpp
+++ b/lib/CodeGen/ShrinkWrap.cpp
@@ -282,8 +282,14 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
if (!Restore)
Restore = &MBB;
- else
+ else if (MPDT->getNode(&MBB)) // If the block is not in the post dom tree, it
+ // means the block never returns. If that's the
+ // case, we don't want to call
+ // `findNearestCommonDominator`, which will
+ // return `Restore`.
Restore = MPDT->findNearestCommonDominator(Restore, &MBB);
+ else
+ Restore = nullptr; // Abort, we can't find a restore point in this case.
// Make sure we would be able to insert the restore code before the
// terminator.
@@ -293,7 +299,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
continue;
// One of the terminators needs to happen before the restore point.
if (MBB.succ_empty()) {
- Restore = nullptr;
+ Restore = nullptr; // Abort, we can't find a restore point in this case.
break;
}
// Look for a restore point that post-dominates all the successors.
@@ -419,7 +425,7 @@ static bool isIrreducibleCFG(const MachineFunction &MF,
}
bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
- if (MF.empty() || !isShrinkWrapEnabled(MF))
+ if (skipFunction(*MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF))
return false;
DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index ab578df4069d..e9eff4d0acb2 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -93,8 +93,8 @@ bool SjLjEHPrepare::doInitialization(Module &M) {
doubleUnderDataTy, // __data
VoidPtrTy, // __personality
VoidPtrTy, // __lsda
- doubleUnderJBufTy, // __jbuf
- nullptr);
+ doubleUnderJBufTy // __jbuf
+ );
return true;
}
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 34892680aceb..1d232c71d824 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -232,7 +232,11 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
if (!MD)
return nullptr;
- auto *VM = dyn_cast<ValueAsMetadata>(MD->getOperand(0));
+ const MDOperand &Op = MD->getOperand(0);
+ if (!Op.get())
+ return nullptr;
+
+ auto *VM = dyn_cast<ValueAsMetadata>(Op);
if (!VM)
report_fatal_error("MD_associated operand is not ValueAsMetadata");
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index 150195f5f85b..e6c5d8753b83 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -487,6 +487,14 @@ void TargetPassConfig::addIRPasses() {
// Insert calls to mcount-like functions.
addPass(createCountingFunctionInserterPass());
+
+ // Add scalarization of the target's unsupported masked memory intrinsics
+ // pass. Each unsupported intrinsic will be replaced with a chain of basic
+ // blocks that stores/loads elements one by one if the appropriate mask bit
+ // is set.
+ addPass(createScalarizeMaskedMemIntrinPass());
+
+ // Expand reduction intrinsics into shuffle sequences if the target wants to.
+ addPass(createExpandReductionsPass());
}
/// Turn exception handling constructs into something the code generators can
@@ -607,6 +615,9 @@ void TargetPassConfig::addMachinePasses() {
addPass(&LocalStackSlotAllocationID, false);
}
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(&LiveRangeShrinkID);
+
// Run pre-ra passes.
addPreRegAlloc();
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 75359fe3c0ea..7392c8327148 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -155,7 +155,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
+ AU.addUsedIfAvailable<AAResultsWrapperPass>();
AU.addUsedIfAvailable<LiveVariables>();
AU.addPreserved<LiveVariables>();
AU.addPreserved<SlotIndexes>();
@@ -1627,7 +1627,10 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
InstrItins = MF->getSubtarget().getInstrItineraryData();
LV = getAnalysisIfAvailable<LiveVariables>();
LIS = getAnalysisIfAvailable<LiveIntervals>();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ if (auto *AAPass = getAnalysisIfAvailable<AAResultsWrapperPass>())
+ AA = &AAPass->getAAResults();
+ else
+ AA = nullptr;
OptLevel = TM.getOptLevel();
bool MadeChange = false;
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index f085132b6a94..407fd9b162e9 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -206,11 +206,12 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
if (InputReg != OutputReg) {
MachineRegisterInfo &MRI = F.getRegInfo();
unsigned InputSub = Input.getSubReg();
- if (InputSub == 0) {
- MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg));
+ if (InputSub == 0 &&
+ MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg))) {
MRI.replaceRegWith(OutputReg, InputReg);
} else {
- // The input register to the PHI has a subregister:
+ // The input register to the PHI has a subregister or it can't be
+ // constrained to the proper register class:
// insert a COPY instead of simply replacing the output
// with the input.
const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo();
diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt
index 410d5a3777d4..8d9353ae5f5e 100644
--- a/lib/DebugInfo/CodeView/CMakeLists.txt
+++ b/lib/DebugInfo/CodeView/CMakeLists.txt
@@ -13,7 +13,7 @@ add_llvm_library(LLVMDebugInfoCodeView
ModuleDebugFragmentVisitor.cpp
ModuleDebugInlineeLinesFragment.cpp
ModuleDebugLineFragment.cpp
- ModuleDebugUnknownFragment.cpp
+ RandomAccessTypeVisitor.cpp
RecordSerialization.cpp
StringTable.cpp
SymbolRecordMapping.cpp
diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
index 0069ee3cc904..b6ed0453d9c4 100644
--- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
@@ -26,8 +26,7 @@ CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks)
: Callbacks(Callbacks) {}
template <typename T>
-static Error visitKnownRecord(CVTypeVisitor &Visitor, CVType &Record,
- TypeVisitorCallbacks &Callbacks) {
+static Error visitKnownRecord(CVType &Record, TypeVisitorCallbacks &Callbacks) {
TypeRecordKind RK = static_cast<TypeRecordKind>(Record.Type);
T KnownRecord(RK);
if (auto EC = Callbacks.visitKnownRecord(Record, KnownRecord))
@@ -76,7 +75,7 @@ void CVTypeVisitor::addTypeServerHandler(TypeServerHandler &Handler) {
Handlers.push_back(&Handler);
}
-Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
+Expected<bool> CVTypeVisitor::handleTypeServer(CVType &Record) {
if (Record.Type == TypeLeafKind::LF_TYPESERVER2 && !Handlers.empty()) {
auto TS = deserializeTypeServerRecord(Record);
if (!TS)
@@ -90,16 +89,16 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
// If the handler processed the record, return success.
if (*ExpectedResult)
- return Error::success();
+ return true;
// Otherwise keep searching for a handler, eventually falling out and
// using the default record handler.
}
}
+ return false;
+}
- if (auto EC = Callbacks.visitTypeBegin(Record))
- return EC;
-
+Error CVTypeVisitor::finishVisitation(CVType &Record) {
switch (Record.Type) {
default:
if (auto EC = Callbacks.visitUnknownType(Record))
@@ -107,7 +106,7 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
break;
#define TYPE_RECORD(EnumName, EnumVal, Name) \
case EnumName: { \
- if (auto EC = visitKnownRecord<Name##Record>(*this, Record, Callbacks)) \
+ if (auto EC = visitKnownRecord<Name##Record>(Record, Callbacks)) \
return EC; \
break; \
}
@@ -124,6 +123,32 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
return Error::success();
}
+Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) {
+ auto ExpectedResult = handleTypeServer(Record);
+ if (!ExpectedResult)
+ return ExpectedResult.takeError();
+ if (*ExpectedResult)
+ return Error::success();
+
+ if (auto EC = Callbacks.visitTypeBegin(Record, Index))
+ return EC;
+
+ return finishVisitation(Record);
+}
+
+Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
+ auto ExpectedResult = handleTypeServer(Record);
+ if (!ExpectedResult)
+ return ExpectedResult.takeError();
+ if (*ExpectedResult)
+ return Error::success();
+
+ if (auto EC = Callbacks.visitTypeBegin(Record))
+ return EC;
+
+ return finishVisitation(Record);
+}
+
static Error visitMemberRecord(CVMemberRecord &Record,
TypeVisitorCallbacks &Callbacks) {
if (auto EC = Callbacks.visitMemberBegin(Record))
diff --git a/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp b/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp
deleted file mode 100644
index 9fd2cb8ed3e8..000000000000
--- a/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-//===- ModuleDebugUnknownFragment.cpp ---------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h" \ No newline at end of file
diff --git a/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp
new file mode 100644
index 000000000000..4cb9acbe07d9
--- /dev/null
+++ b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp
@@ -0,0 +1,91 @@
+//===- RandomAccessTypeVisitor.cpp ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h"
+
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+RandomAccessTypeVisitor::RandomAccessTypeVisitor(
+ const CVTypeArray &Types, uint32_t NumRecords,
+ PartialOffsetArray PartialOffsets)
+ : Database(NumRecords), Types(Types), DatabaseVisitor(Database),
+ InternalVisitor(Pipeline), PartialOffsets(PartialOffsets) {
+ Pipeline.addCallbackToPipeline(Deserializer);
+ Pipeline.addCallbackToPipeline(DatabaseVisitor);
+
+ KnownOffsets.resize(Database.capacity());
+}
+
+Error RandomAccessTypeVisitor::visitTypeIndex(TypeIndex TI,
+ TypeVisitorCallbacks &Callbacks) {
+ assert(TI.toArrayIndex() < Database.capacity());
+
+ if (!Database.contains(TI)) {
+ if (auto EC = visitRangeForType(TI))
+ return EC;
+ }
+
+ assert(Database.contains(TI));
+ auto &Record = Database.getTypeRecord(TI);
+ CVTypeVisitor V(Callbacks);
+ return V.visitTypeRecord(Record, TI);
+}
+
+Error RandomAccessTypeVisitor::visitRangeForType(TypeIndex TI) {
+ if (PartialOffsets.empty()) {
+ TypeIndex TIB(TypeIndex::FirstNonSimpleIndex);
+ TypeIndex TIE = TIB + Database.capacity();
+ return visitRange(TIB, 0, TIE);
+ }
+
+ auto Next = std::upper_bound(PartialOffsets.begin(), PartialOffsets.end(), TI,
+ [](TypeIndex Value, const TypeIndexOffset &IO) {
+ return Value < IO.Type;
+ });
+
+ assert(Next != PartialOffsets.begin());
+ auto Prev = std::prev(Next);
+
+ TypeIndex TIB = Prev->Type;
+ TypeIndex TIE;
+ if (Next == PartialOffsets.end()) {
+ TIE = TypeIndex::fromArrayIndex(Database.capacity());
+ } else {
+ TIE = Next->Type;
+ }
+
+ if (auto EC = visitRange(TIB, Prev->Offset, TIE))
+ return EC;
+ return Error::success();
+}
+
+Error RandomAccessTypeVisitor::visitRange(TypeIndex Begin, uint32_t BeginOffset,
+ TypeIndex End) {
+
+ auto RI = Types.at(BeginOffset);
+ assert(RI != Types.end());
+
+ while (Begin != End) {
+ assert(!Database.contains(Begin));
+ if (auto EC = InternalVisitor.visitTypeRecord(*RI, Begin))
+ return EC;
+ KnownOffsets[Begin.toArrayIndex()] = BeginOffset;
+
+ BeginOffset += RI.getRecordLength();
+ ++Begin;
+ ++RI;
+ }
+
+ return Error::success();
+}
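
visitRangeForType locates the partial-offset entry covering a type index by
taking std::upper_bound and stepping back one slot. A self-contained sketch
of that lookup over a sorted offset table (illustrative types, not the real
CodeView ones):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <iterator>
    #include <vector>

    struct IndexOffset { uint32_t Index; uint32_t Offset; };

    // Returns the entry with the largest Index <= Target. Assumes Table is
    // sorted by Index and Target is not below the first entry.
    IndexOffset findCoveringEntry(const std::vector<IndexOffset> &Table,
                                  uint32_t Target) {
      auto Next = std::upper_bound(
          Table.begin(), Table.end(), Target,
          [](uint32_t Value, const IndexOffset &IO) { return Value < IO.Index; });
      assert(Next != Table.begin() && "Target precedes the first entry");
      return *std::prev(Next);
    }

    int main() {
      std::vector<IndexOffset> Table = {{0, 0}, {8, 100}, {16, 260}};
      assert(findCoveringEntry(Table, 10).Offset == 100);
    }
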
diff --git a/lib/DebugInfo/CodeView/TypeDatabase.cpp b/lib/DebugInfo/CodeView/TypeDatabase.cpp
index 5b8841041f88..7924440e5e29 100644
--- a/lib/DebugInfo/CodeView/TypeDatabase.cpp
+++ b/lib/DebugInfo/CodeView/TypeDatabase.cpp
@@ -65,20 +65,32 @@ static const SimpleTypeEntry SimpleTypeNames[] = {
{"__bool64*", SimpleTypeKind::Boolean64},
};
-TypeDatabase::TypeDatabase(uint32_t ExpectedSize) : TypeNameStorage(Allocator) {
- CVUDTNames.reserve(ExpectedSize);
- TypeRecords.reserve(ExpectedSize);
+TypeDatabase::TypeDatabase(uint32_t Capacity) : TypeNameStorage(Allocator) {
+ CVUDTNames.resize(Capacity);
+ TypeRecords.resize(Capacity);
+ ValidRecords.resize(Capacity);
}
-/// Gets the type index for the next type record.
-TypeIndex TypeDatabase::getNextTypeIndex() const {
- return TypeIndex(TypeIndex::FirstNonSimpleIndex + CVUDTNames.size());
+TypeIndex TypeDatabase::appendType(StringRef Name, const CVType &Data) {
+ TypeIndex TI;
+ TI = getAppendIndex();
+ if (TI.toArrayIndex() >= capacity())
+ grow();
+ recordType(Name, TI, Data);
+ return TI;
}
-/// Records the name of a type, and reserves its type index.
-void TypeDatabase::recordType(StringRef Name, const CVType &Data) {
- CVUDTNames.push_back(Name);
- TypeRecords.push_back(Data);
+void TypeDatabase::recordType(StringRef Name, TypeIndex Index,
+ const CVType &Data) {
+ uint32_t AI = Index.toArrayIndex();
+
+ assert(!contains(Index));
+ assert(AI < capacity());
+
+ CVUDTNames[AI] = Name;
+ TypeRecords[AI] = Data;
+ ValidRecords.set(AI);
+ ++Count;
}
/// Saves the name in a StringSet and creates a stable StringRef.
@@ -104,24 +116,47 @@ StringRef TypeDatabase::getTypeName(TypeIndex Index) const {
return "<unknown simple type>";
}
- uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex;
- if (I < CVUDTNames.size())
- return CVUDTNames[I];
+ if (contains(Index))
+ return CVUDTNames[Index.toArrayIndex()];
return "<unknown UDT>";
}
const CVType &TypeDatabase::getTypeRecord(TypeIndex Index) const {
- return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex];
+ assert(contains(Index));
+ return TypeRecords[Index.toArrayIndex()];
}
CVType &TypeDatabase::getTypeRecord(TypeIndex Index) {
- return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex];
+ assert(contains(Index));
+ return TypeRecords[Index.toArrayIndex()];
+}
+
+bool TypeDatabase::contains(TypeIndex Index) const {
+ uint32_t AI = Index.toArrayIndex();
+ if (AI >= capacity())
+ return false;
+
+ return ValidRecords.test(AI);
}
-bool TypeDatabase::containsTypeIndex(TypeIndex Index) const {
- uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex;
- return I < CVUDTNames.size();
+uint32_t TypeDatabase::size() const { return Count; }
+
+uint32_t TypeDatabase::capacity() const { return TypeRecords.size(); }
+
+void TypeDatabase::grow() {
+ TypeRecords.emplace_back();
+ CVUDTNames.emplace_back();
+ ValidRecords.resize(ValidRecords.size() + 1);
}
-uint32_t TypeDatabase::size() const { return CVUDTNames.size(); }
+bool TypeDatabase::empty() const { return size() == 0; }
+
+TypeIndex TypeDatabase::getAppendIndex() const {
+ if (empty())
+ return TypeIndex::fromArrayIndex(0);
+
+ int Index = ValidRecords.find_last();
+ assert(Index != -1);
+ return TypeIndex::fromArrayIndex(Index) + 1;
+}
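
TypeDatabase now behaves as a fixed-capacity sparse table: slots may be
filled in any order, and a validity bitmap distinguishes recorded entries
from empty ones. A stripped-down sketch of the same bookkeeping over
standard containers (not the LLVM types):

    #include <cassert>
    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    struct SparseTable {
      std::vector<std::string> Names;
      std::vector<bool> Valid; // Valid[I] <=> slot I has been recorded
      uint32_t Count = 0;

      explicit SparseTable(uint32_t Capacity)
          : Names(Capacity), Valid(Capacity) {}

      bool contains(uint32_t I) const { return I < Valid.size() && Valid[I]; }

      void record(uint32_t I, std::string Name) {
        assert(!contains(I) && I < Valid.size());
        Names[I] = std::move(Name);
        Valid[I] = true;
        ++Count;
      }
    };

    int main() {
      SparseTable T(4);
      T.record(2, "Foo"); // out-of-order fill is allowed
      assert(T.contains(2) && !T.contains(0) && T.Count == 1);
    }
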
diff --git a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
index c234afd2288b..8d97f8b1cb40 100644
--- a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
@@ -15,7 +15,7 @@ using namespace llvm;
using namespace llvm::codeview;
-Error TypeDatabaseVisitor::visitTypeBegin(CVRecord<TypeLeafKind> &Record) {
+Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record) {
assert(!IsInFieldList);
// Reset Name to the empty string. If the visitor sets it, we know it.
Name = "";
@@ -28,6 +28,22 @@ Error TypeDatabaseVisitor::visitTypeBegin(CVRecord<TypeLeafKind> &Record) {
return Error::success();
}
+Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) {
+ if (auto EC = visitTypeBegin(Record))
+ return EC;
+
+ CurrentTypeIndex = Index;
+ return Error::success();
+}
+
+StringRef TypeDatabaseVisitor::getTypeName(TypeIndex Index) const {
+ return TypeDB->getTypeName(Index);
+}
+
+StringRef TypeDatabaseVisitor::saveTypeName(StringRef Name) {
+ return TypeDB->saveTypeName(Name);
+}
+
Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) {
if (CVR.Type == LF_FIELDLIST) {
assert(IsInFieldList);
@@ -39,7 +55,12 @@ Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) {
// CVUDTNames is indexed by type index, and must have one entry for every
// type. Field list members are not recorded, and are only referenced by
// their containing field list record.
- TypeDB.recordType(Name, CVR);
+ if (CurrentTypeIndex)
+ TypeDB->recordType(Name, *CurrentTypeIndex, CVR);
+ else
+ TypeDB->appendType(Name, CVR);
+
+ CurrentTypeIndex.reset();
return Error::success();
}
@@ -73,13 +94,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) {
uint32_t Size = Indices.size();
SmallString<256> TypeName("(");
for (uint32_t I = 0; I < Size; ++I) {
- StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]);
+ StringRef ArgTypeName = getTypeName(Indices[I]);
TypeName.append(ArgTypeName);
if (I + 1 != Size)
TypeName.append(", ");
}
TypeName.push_back(')');
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
@@ -89,13 +110,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
uint32_t Size = Indices.size();
SmallString<256> TypeName("\"");
for (uint32_t I = 0; I < Size; ++I) {
- StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]);
+ StringRef ArgTypeName = getTypeName(Indices[I]);
TypeName.append(ArgTypeName);
if (I + 1 != Size)
TypeName.append("\" \"");
}
TypeName.push_back('\"');
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
@@ -132,26 +153,26 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
ProcedureRecord &Proc) {
- StringRef ReturnTypeName = TypeDB.getTypeName(Proc.getReturnType());
- StringRef ArgListTypeName = TypeDB.getTypeName(Proc.getArgumentList());
+ StringRef ReturnTypeName = getTypeName(Proc.getReturnType());
+ StringRef ArgListTypeName = getTypeName(Proc.getArgumentList());
SmallString<256> TypeName(ReturnTypeName);
TypeName.push_back(' ');
TypeName.append(ArgListTypeName);
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
MemberFunctionRecord &MF) {
- StringRef ReturnTypeName = TypeDB.getTypeName(MF.getReturnType());
- StringRef ClassTypeName = TypeDB.getTypeName(MF.getClassType());
- StringRef ArgListTypeName = TypeDB.getTypeName(MF.getArgumentList());
+ StringRef ReturnTypeName = getTypeName(MF.getReturnType());
+ StringRef ClassTypeName = getTypeName(MF.getClassType());
+ StringRef ArgListTypeName = getTypeName(MF.getArgumentList());
SmallString<256> TypeName(ReturnTypeName);
TypeName.push_back(' ');
TypeName.append(ClassTypeName);
TypeName.append("::");
TypeName.append(ArgListTypeName);
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
@@ -171,13 +192,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
if (Ptr.isPointerToMember()) {
const MemberPointerInfo &MI = Ptr.getMemberInfo();
- StringRef PointeeName = TypeDB.getTypeName(Ptr.getReferentType());
- StringRef ClassName = TypeDB.getTypeName(MI.getContainingType());
+ StringRef PointeeName = getTypeName(Ptr.getReferentType());
+ StringRef ClassName = getTypeName(MI.getContainingType());
SmallString<256> TypeName(PointeeName);
TypeName.push_back(' ');
TypeName.append(ClassName);
TypeName.append("::*");
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
} else {
SmallString<256> TypeName;
if (Ptr.isConst())
@@ -187,7 +208,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
if (Ptr.isUnaligned())
TypeName.append("__unaligned ");
- TypeName.append(TypeDB.getTypeName(Ptr.getReferentType()));
+ TypeName.append(getTypeName(Ptr.getReferentType()));
if (Ptr.getMode() == PointerMode::LValueReference)
TypeName.append("&");
@@ -197,7 +218,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
TypeName.append("*");
if (!TypeName.empty())
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
}
return Error::success();
}
@@ -205,7 +226,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) {
uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers());
- StringRef ModifiedName = TypeDB.getTypeName(Mod.getModifiedType());
+ StringRef ModifiedName = getTypeName(Mod.getModifiedType());
SmallString<256> TypeName;
if (Mods & uint16_t(ModifierOptions::Const))
TypeName.append("const ");
@@ -214,14 +235,14 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) {
if (Mods & uint16_t(ModifierOptions::Unaligned))
TypeName.append("__unaligned ");
TypeName.append(ModifiedName);
- Name = TypeDB.saveTypeName(TypeName);
+ Name = saveTypeName(TypeName);
return Error::success();
}
Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
VFTableShapeRecord &Shape) {
- Name = TypeDB.saveTypeName("<vftable " + utostr(Shape.getEntryCount()) +
- " methods>");
+ Name =
+ saveTypeName("<vftable " + utostr(Shape.getEntryCount()) + " methods>");
return Error::success();
}
diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
index 870d95221e7d..27a6e0987886 100644
--- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -173,10 +173,13 @@ void TypeDumpVisitor::printItemIndex(StringRef FieldName, TypeIndex TI) const {
}
Error TypeDumpVisitor::visitTypeBegin(CVType &Record) {
+ TypeIndex TI = getSourceDB().getAppendIndex();
+ return visitTypeBegin(Record, TI);
+}
+
+Error TypeDumpVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) {
W->startLine() << getLeafTypeName(Record.Type);
- W->getOStream() << " ("
- << HexNumber(getSourceDB().getNextTypeIndex().getIndex())
- << ")";
+ W->getOStream() << " (" << HexNumber(Index.getIndex()) << ")";
W->getOStream() << " {\n";
W->indent();
W->printEnum("TypeLeafKind", unsigned(Record.Type),
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 246899ac12b9..59a060d143ff 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -66,7 +66,7 @@ uint64_t llvm::getRelocatedValue(const DataExtractor &Data, uint32_t Size,
RelocAddrMap::const_iterator AI = Relocs->find(*Off);
if (AI == Relocs->end())
return Data.getUnsigned(Off, Size);
- return Data.getUnsigned(Off, Size) + AI->second.second;
+ return Data.getUnsigned(Off, Size) + AI->second.Value;
}
static void dumpAccelSection(raw_ostream &OS, StringRef Name,
@@ -905,16 +905,23 @@ static Error createError(const Twine &Reason, llvm::Error E) {
/// Returns the address of the symbol the relocation is applied against. Used
/// for further relocation computation. The symbol's section load address is
/// taken into account if the LoadedObjectInfo interface is provided.
-static Expected<uint64_t> getSymbolAddress(const object::ObjectFile &Obj,
- const RelocationRef &Reloc,
- const LoadedObjectInfo *L) {
+static Expected<uint64_t>
+getSymbolAddress(const object::ObjectFile &Obj, const RelocationRef &Reloc,
+ const LoadedObjectInfo *L,
+ std::map<SymbolRef, uint64_t> &Cache) {
uint64_t Ret = 0;
object::section_iterator RSec = Obj.section_end();
object::symbol_iterator Sym = Reloc.getSymbol();
+ std::map<SymbolRef, uint64_t>::iterator CacheIt = Cache.end();
// First calculate the address of the symbol or section as it appears
// in the object file
if (Sym != Obj.symbol_end()) {
+ bool New;
+ std::tie(CacheIt, New) = Cache.insert({*Sym, 0});
+ if (!New)
+ return CacheIt->second;
+
Expected<uint64_t> SymAddrOrErr = Sym->getAddress();
if (!SymAddrOrErr)
return createError("error: failed to compute symbol address: ",
@@ -943,6 +950,10 @@ static Expected<uint64_t> getSymbolAddress(const object::ObjectFile &Obj,
if (L && RSec != Obj.section_end())
if (uint64_t SectionLoadAddress = L->getSectionLoadAddress(*RSec))
Ret += SectionLoadAddress - RSec->getAddress();
+
+ if (CacheIt != Cache.end())
+ CacheIt->second = Ret;
+
return Ret;
}
@@ -1075,6 +1086,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
continue;
}
+ std::map<SymbolRef, uint64_t> AddrCache;
if (Section.relocation_begin() != Section.relocation_end()) {
uint64_t SectionSize = RelocatedSection->getSize();
for (const RelocationRef &Reloc : Section.relocations()) {
@@ -1083,7 +1095,8 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
if (isRelocScattered(Obj, Reloc))
continue;
- Expected<uint64_t> SymAddrOrErr = getSymbolAddress(Obj, Reloc, L);
+ Expected<uint64_t> SymAddrOrErr =
+ getSymbolAddress(Obj, Reloc, L, AddrCache);
if (!SymAddrOrErr) {
errs() << toString(SymAddrOrErr.takeError()) << '\n';
continue;
@@ -1114,7 +1127,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
<< " at " << format("%p", Address)
<< " with width " << format("%d", R.Width)
<< "\n");
- Map->insert(std::make_pair(Address, std::make_pair(R.Width, R.Value)));
+ Map->insert({Address, {(uint8_t)R.Width, R.Value}});
}
}
}
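
The symbol-address cache added above uses the insert-then-fill idiom:
std::map::insert reports through its bool result whether the key was new, so
the expensive computation runs only on a miss and the reserved slot is
filled afterwards. A self-contained sketch:

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <tuple>

    static uint64_t computeExpensive(uint64_t Key) { return Key * 2; } // stand-in

    uint64_t cachedLookup(std::map<uint64_t, uint64_t> &Cache, uint64_t Key) {
      std::map<uint64_t, uint64_t>::iterator It;
      bool New;
      std::tie(It, New) = Cache.insert({Key, 0});
      if (!New)
        return It->second;                // hit: reuse the stored value
      It->second = computeExpensive(Key); // miss: fill the reserved slot
      return It->second;
    }

    int main() {
      std::map<uint64_t, uint64_t> Cache;
      assert(cachedLookup(Cache, 21) == 42 && cachedLookup(Cache, 21) == 42);
    }
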
diff --git a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
index 0cf71f530446..6601393d7459 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
@@ -54,9 +54,8 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) {
if (ParsedCUOffsets.insert(CUOffset).second) {
DWARFAddressRangesVector CURanges;
CU->collectAddressRanges(CURanges);
- for (const auto &R : CURanges) {
- appendRange(CUOffset, R.first, R.second);
- }
+ for (const auto &R : CURanges)
+ appendRange(CUOffset, R.LowPC, R.HighPC);
}
}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
index 9380fe8fe85d..8da797750abd 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -69,8 +69,8 @@ DWARFDebugRangeList::getAbsoluteRanges(uint64_t BaseAddress) const {
if (RLE.isBaseAddressSelectionEntry(AddressSize)) {
BaseAddress = RLE.EndAddress;
} else {
- Res.push_back(std::make_pair(BaseAddress + RLE.StartAddress,
- BaseAddress + RLE.EndAddress));
+ Res.push_back(
+ {BaseAddress + RLE.StartAddress, BaseAddress + RLE.EndAddress});
}
}
return Res;
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 24039eb35209..e3bd759ba94b 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -60,8 +60,8 @@ static void dumpRanges(raw_ostream &OS, const DWARFAddressRangesVector& Ranges,
OS << '\n';
OS.indent(Indent);
OS << format("[0x%0*" PRIx64 " - 0x%0*" PRIx64 ")",
- AddressSize*2, Range.first,
- AddressSize*2, Range.second);
+ AddressSize*2, Range.LowPC,
+ AddressSize*2, Range.HighPC);
}
}
@@ -229,9 +229,9 @@ DWARFDie::getAddressRanges() const {
return DWARFAddressRangesVector();
// Single range specified by low/high PC.
uint64_t LowPC, HighPC;
- if (getLowAndHighPC(LowPC, HighPC)) {
- return DWARFAddressRangesVector(1, std::make_pair(LowPC, HighPC));
- }
+ if (getLowAndHighPC(LowPC, HighPC))
+ return {{LowPC, HighPC}};
+
// Multiple ranges from .debug_ranges section.
auto RangesOffset = toSectionOffset(find(DW_AT_ranges));
if (RangesOffset) {
@@ -257,7 +257,7 @@ DWARFDie::collectChildrenAddressRanges(DWARFAddressRangesVector& Ranges) const {
bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const {
for (const auto& R : getAddressRanges()) {
- if (R.first <= Address && Address < R.second)
+ if (R.LowPC <= Address && Address < R.HighPC)
return true;
}
return false;
diff --git a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
index e0f819383289..25824f6eb83b 100644
--- a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
@@ -24,7 +24,11 @@ bool DWARFTypeUnit::extractImpl(DataExtractor debug_info,
return false;
TypeHash = debug_info.getU64(offset_ptr);
TypeOffset = debug_info.getU32(offset_ptr);
- return TypeOffset < getLength();
+ // TypeOffset is relative to the beginning of the header,
+ // so we have to account for the leading length field.
+  // FIXME: The size of the length field is 12 bytes in DWARF64.
+ unsigned SizeOfLength = 4;
+ return TypeOffset < getLength() + SizeOfLength;
}
void DWARFTypeUnit::dump(raw_ostream &OS, bool SummarizeTypes) {
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index f50487fc3ba3..3835d4da9ae9 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -349,18 +349,18 @@ void DWARFUnit::updateAddressDieMap(DWARFDie Die) {
if (Die.isSubroutineDIE()) {
for (const auto &R : Die.getAddressRanges()) {
// Ignore 0-sized ranges.
- if (R.first == R.second)
+ if (R.LowPC == R.HighPC)
continue;
- auto B = AddrDieMap.upper_bound(R.first);
- if (B != AddrDieMap.begin() && R.first < (--B)->second.first) {
+ auto B = AddrDieMap.upper_bound(R.LowPC);
+ if (B != AddrDieMap.begin() && R.LowPC < (--B)->second.first) {
// The range is a sub-range of existing ranges, we need to split the
// existing range.
- if (R.second < B->second.first)
- AddrDieMap[R.second] = B->second;
- if (R.first > B->first)
- AddrDieMap[B->first].first = R.first;
+ if (R.HighPC < B->second.first)
+ AddrDieMap[R.HighPC] = B->second;
+ if (R.LowPC > B->first)
+ AddrDieMap[B->first].first = R.LowPC;
}
- AddrDieMap[R.first] = std::make_pair(R.second, Die);
+ AddrDieMap[R.LowPC] = std::make_pair(R.HighPC, Die);
}
}
// Parent DIEs are added to the AddrDieMap prior to the Children DIEs to
diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 9494e876da15..8a544296f65c 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
using namespace dwarf;
using namespace object;
-void DWARFVerifier::verifyDebugInfoAttribute(DWARFDie &Die,
+void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
DWARFAttribute &AttrValue) {
const auto Attr = AttrValue.Attr;
switch (Attr) {
@@ -68,7 +68,7 @@ void DWARFVerifier::verifyDebugInfoAttribute(DWARFDie &Die,
}
}
-void DWARFVerifier::verifyDebugInfoForm(DWARFDie &Die,
+void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
DWARFAttribute &AttrValue) {
const auto Form = AttrValue.Value.getForm();
switch (Form) {
@@ -136,7 +136,7 @@ void DWARFVerifier::verifyDebugInfoForm(DWARFDie &Die,
}
}
-void DWARFVerifier::veifyDebugInfoReferences() {
+void DWARFVerifier::verifyDebugInfoReferences() {
// Take all references and make sure they point to an actual DIE by
// getting the DIE by offset and emitting an error
OS << "Verifying .debug_info references...\n";
@@ -172,7 +172,7 @@ bool DWARFVerifier::handleDebugInfo() {
}
}
}
- veifyDebugInfoReferences();
+ verifyDebugInfoReferences();
return NumDebugInfoErrors == 0;
}
diff --git a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
index 375c35b11145..701a318511b8 100644
--- a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
@@ -109,7 +109,7 @@ uint32_t TpiStreamBuilder::calculateHashBufferSize() const {
}
uint32_t TpiStreamBuilder::calculateIndexOffsetSize() const {
- return TypeIndexOffsets.size() * sizeof(TypeIndexOffset);
+ return TypeIndexOffsets.size() * sizeof(codeview::TypeIndexOffset);
}
Error TpiStreamBuilder::finalizeMsfLayout() {
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index a5100a56bcf1..a27573f93b97 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -94,9 +94,8 @@ class OrcMCJITReplacement : public ExecutionEngine {
return ClientMM->registerEHFrames(Addr, LoadAddr, Size);
}
- void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr,
- size_t Size) override {
- return ClientMM->deregisterEHFrames(Addr, LoadAddr, Size);
+ void deregisterEHFrames() override {
+ return ClientMM->deregisterEHFrames();
}
void notifyObjectLoaded(RuntimeDyld &RTDyld,
diff --git a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
index de73fbde8eb7..99e84b7496d4 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
@@ -134,6 +134,18 @@ void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr,
#endif
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
+ size_t Size) {
+ registerEHFramesInProcess(Addr, Size);
+ EHFrames.push_back({Addr, Size});
+}
+
+void RTDyldMemoryManager::deregisterEHFrames() {
+ for (auto &Frame : EHFrames)
+ deregisterEHFramesInProcess(Frame.Addr, Frame.Size);
+ EHFrames.clear();
+}
+
static int jit_noop() {
return 0;
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index df9d2ceba329..e9a4b71c903d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -73,7 +73,9 @@ namespace llvm {
void RuntimeDyldImpl::registerEHFrames() {}
-void RuntimeDyldImpl::deregisterEHFrames() {}
+void RuntimeDyldImpl::deregisterEHFrames() {
+ MemMgr.deregisterEHFrames();
+}
#ifndef NDEBUG
static void dumpSectionMemory(const SectionEntry &S, StringRef State) {
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 50f63fb8dd39..660843765b3f 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -221,22 +221,10 @@ void RuntimeDyldELF::registerEHFrames() {
uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress();
size_t EHFrameSize = Sections[EHFrameSID].getSize();
MemMgr.registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
- RegisteredEHFrameSections.push_back(EHFrameSID);
}
UnregisteredEHFrameSections.clear();
}
-void RuntimeDyldELF::deregisterEHFrames() {
- for (int i = 0, e = RegisteredEHFrameSections.size(); i != e; ++i) {
- SID EHFrameSID = RegisteredEHFrameSections[i];
- uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress();
- uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress();
- size_t EHFrameSize = Sections[EHFrameSID].getSize();
- MemMgr.deregisterEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
- }
- RegisteredEHFrameSections.clear();
-}
-
std::unique_ptr<RuntimeDyldELF>
llvm::RuntimeDyldELF::create(Triple::ArchType Arch,
RuntimeDyld::MemoryManager &MemMgr,
@@ -802,20 +790,35 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section,
writeInt32BE(LocalAddress, Delta / 2);
break;
}
+ case ELF::R_390_PC16: {
+ int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
+ assert(int16_t(Delta) == Delta && "R_390_PC16 overflow");
+ writeInt16BE(LocalAddress, Delta);
+ break;
+ }
case ELF::R_390_PC32: {
int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
assert(int32_t(Delta) == Delta && "R_390_PC32 overflow");
writeInt32BE(LocalAddress, Delta);
break;
}
- case ELF::R_390_64:
- writeInt64BE(LocalAddress, Value + Addend);
- break;
case ELF::R_390_PC64: {
int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
writeInt64BE(LocalAddress, Delta);
break;
}
+ case ELF::R_390_8:
+ *LocalAddress = (uint8_t)(Value + Addend);
+ break;
+ case ELF::R_390_16:
+ writeInt16BE(LocalAddress, Value + Addend);
+ break;
+ case ELF::R_390_32:
+ writeInt32BE(LocalAddress, Value + Addend);
+ break;
+ case ELF::R_390_64:
+ writeInt64BE(LocalAddress, Value + Addend);
+ break;
}
}
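
The new R_390_PC16 case follows the same shape as the existing PC32/PC64
cases: compute the delta from the fixup location, assert that it fits the
field, and store it big-endian. A self-contained sketch of that arithmetic
(plain integers, no LLVM types):

    #include <cassert>
    #include <cstdint>

    // Writes a 16-bit big-endian PC-relative value, asserting that the
    // delta actually fits in the relocation field.
    void applyPC16(uint8_t *Loc, uint64_t Place, uint64_t Value,
                   int64_t Addend) {
      int64_t Delta = static_cast<int64_t>(Value + Addend - Place);
      assert(static_cast<int16_t>(Delta) == Delta && "R_390_PC16 overflow");
      Loc[0] = static_cast<uint8_t>(Delta >> 8); // big-endian: high byte first
      Loc[1] = static_cast<uint8_t>(Delta);
    }

    int main() {
      uint8_t Buf[2];
      applyPC16(Buf, /*Place=*/0x1000, /*Value=*/0x1100, /*Addend=*/0);
      assert(Buf[0] == 0x01 && Buf[1] == 0x00); // delta of +0x100 bytes
    }
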
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 84dd810101f3..fb5da6dd8bbb 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -152,7 +152,6 @@ private:
// in a table until we receive a request to register all unregistered
// EH frame sections with the memory manager.
SmallVector<SID, 2> UnregisteredEHFrameSections;
- SmallVector<SID, 2> RegisteredEHFrameSections;
// Map between GOT relocation value and corresponding GOT offset
std::map<RelocationValueRef, uint64_t> GOTOffsetMap;
@@ -180,7 +179,6 @@ public:
StubMap &Stubs) override;
bool isCompatibleFile(const object::ObjectFile &Obj) const override;
void registerEHFrames() override;
- void deregisterEHFrames() override;
Error finalizeLoad(const ObjectFile &Obj,
ObjSectionToIDMap &SectionMap) override;
};
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index f5cc883d98fd..18c23c5a2a5d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -515,7 +515,7 @@ public:
virtual void registerEHFrames();
- virtual void deregisterEHFrames();
+ void deregisterEHFrames();
virtual Error finalizeLoad(const ObjectFile &ObjImg,
ObjSectionToIDMap &SectionMap) {
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
index 0398413e1532..6aa1a2bdb926 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
@@ -217,7 +217,6 @@ public:
}
void registerEHFrames() override {}
- void deregisterEHFrames() override {}
};
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
index 8c6af0bd9c6d..318afa21a88b 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
@@ -316,7 +316,6 @@ public:
}
void registerEHFrames() override {}
- void deregisterEHFrames() override {}
};
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
index 109beb36f1ee..26e73989d7ed 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
@@ -194,9 +194,6 @@ public:
}
UnregisteredEHFrameSections.clear();
}
- void deregisterEHFrames() override {
- // Stub
- }
Error finalizeLoad(const ObjectFile &Obj,
ObjSectionToIDMap &SectionMap) override {
// Look for and record the EH frame section IDs.
diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp
index b85ba210afb3..e93c79cfcec6 100644
--- a/lib/Fuzzer/FuzzerDriver.cpp
+++ b/lib/Fuzzer/FuzzerDriver.cpp
@@ -656,7 +656,8 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
SMR.WaitClient();
size_t Size = SMR.ReadByteArraySize();
SMR.WriteByteArray(nullptr, 0);
- F->RunOne(SMR.GetByteArray(), Size);
+ const Unit tmp(SMR.GetByteArray(), SMR.GetByteArray() + Size);
+ F->RunOne(tmp.data(), tmp.size());
SMR.PostServer();
}
return 0;
diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def
index 0a1ff1b1df6a..7ff196c8fa96 100644
--- a/lib/Fuzzer/FuzzerFlags.def
+++ b/lib/Fuzzer/FuzzerFlags.def
@@ -92,10 +92,10 @@ FUZZER_FLAG_INT(print_pcs, 0, "If 1, print out newly covered PCs.")
FUZZER_FLAG_INT(print_final_stats, 0, "If 1, print statistics at exit.")
FUZZER_FLAG_INT(print_corpus_stats, 0,
"If 1, print statistics on corpus elements at exit.")
-FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information at exit."
- " Experimental, only with trace-pc-guard")
-FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information at exit."
- " Experimental, only with trace-pc-guard")
+FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information as text"
+ " at exit.")
+FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information as a"
+ " .sancov file at exit.")
FUZZER_FLAG_INT(handle_segv, 1, "If 1, try to intercept SIGSEGV.")
FUZZER_FLAG_INT(handle_bus, 1, "If 1, try to intercept SIGBUS.")
FUZZER_FLAG_INT(handle_abrt, 1, "If 1, try to intercept SIGABRT.")
diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h
index ad067ee2c0d9..5f184c2316e2 100644
--- a/lib/Fuzzer/FuzzerInternal.h
+++ b/lib/Fuzzer/FuzzerInternal.h
@@ -91,6 +91,7 @@ public:
private:
void AlarmCallback();
void CrashCallback();
+ void CrashOnOverwrittenData();
void InterruptCallback();
void MutateAndTestOne();
void ReportNewCoverage(InputInfo *II, const Unit &U);
diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp
index d84c3dbdaf77..14caa203c5ef 100644
--- a/lib/Fuzzer/FuzzerLoop.cpp
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@@ -422,6 +422,24 @@ size_t Fuzzer::GetCurrentUnitInFuzzingThead(const uint8_t **Data) const {
return CurrentUnitSize;
}
+void Fuzzer::CrashOnOverwrittenData() {
+ Printf("==%d== ERROR: libFuzzer: fuzz target overwrites it's const input\n",
+ GetPid());
+ DumpCurrentUnit("crash-");
+ Printf("SUMMARY: libFuzzer: out-of-memory\n");
+ _Exit(Options.ErrorExitCode); // Stop right now.
+}
+
+// Compare two arrays, but not all bytes if the arrays are large.
+static bool LooseMemeq(const uint8_t *A, const uint8_t *B, size_t Size) {
+ const size_t Limit = 64;
+ if (Size <= 64)
+ return !memcmp(A, B, Size);
+ // Compare first and last Limit/2 bytes.
+ return !memcmp(A, B, Limit / 2) &&
+ !memcmp(A + Size - Limit / 2, B + Size - Limit / 2, Limit / 2);
+}
+
void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) {
assert(InFuzzingThread());
if (SMR.IsClient())
@@ -443,6 +461,8 @@ void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) {
(void)Res;
assert(Res == 0);
HasMoreMallocsThanFrees = AllocTracer.Stop();
+ if (!LooseMemeq(DataCopy, Data, Size))
+ CrashOnOverwrittenData();
CurrentUnitSize = 0;
delete[] DataCopy;
}
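
The overwrite check compares only the edges of large inputs so its cost per
execution stays bounded. A self-contained sketch of the same prefix/suffix
comparison:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // For big buffers compare only the first and last Limit/2 bytes; small
    // buffers are compared in full. A middle-byte change can go unnoticed,
    // which is the accepted trade-off.
    static bool looseEqual(const uint8_t *A, const uint8_t *B, size_t Size) {
      const size_t Limit = 64;
      if (Size <= Limit)
        return std::memcmp(A, B, Size) == 0;
      return std::memcmp(A, B, Limit / 2) == 0 &&
             std::memcmp(A + Size - Limit / 2, B + Size - Limit / 2,
                         Limit / 2) == 0;
    }

    int main() {
      uint8_t X[128] = {0}, Y[128] = {0};
      Y[64] = 1;                     // differs in the middle...
      assert(looseEqual(X, Y, 128)); // ...and is deliberately not seen
    }
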
diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp
index cd846c7deec5..e60d4130de10 100644
--- a/lib/Fuzzer/FuzzerMutate.cpp
+++ b/lib/Fuzzer/FuzzerMutate.cpp
@@ -217,11 +217,12 @@ DictionaryEntry MutationDispatcher::MakeDictionaryEntryFromCMP(
size_t NumPositions = 0;
for (const uint8_t *Cur = Data;
Cur < End && NumPositions < kMaxNumPositions; Cur++) {
- Cur = (uint8_t *)SearchMemory(Cur, End - Cur, ExistingBytes, ArgSize);
+ Cur =
+ (const uint8_t *)SearchMemory(Cur, End - Cur, ExistingBytes, ArgSize);
if (!Cur) break;
Positions[NumPositions++] = Cur - Data;
}
- if (!NumPositions) break;
+ if (!NumPositions) continue;
return DictionaryEntry(W, Positions[Rand(NumPositions)]);
}
DictionaryEntry DE(W);
diff --git a/lib/Fuzzer/afl/afl_driver.cpp b/lib/Fuzzer/afl/afl_driver.cpp
index b3a54e57fceb..3815ed11cf60 100644
--- a/lib/Fuzzer/afl/afl_driver.cpp
+++ b/lib/Fuzzer/afl/afl_driver.cpp
@@ -59,6 +59,11 @@ statistics from the file. If that fails then the process will quit.
#include <signal.h>
#include <sys/resource.h>
#include <sys/time.h>
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+
// Platform detection. Copied from FuzzerInternal.h
#ifdef __linux__
#define LIBFUZZER_LINUX 1
@@ -245,17 +250,39 @@ extern "C" size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize) {
return 0;
}
+// Execute any files provided as command-line parameters, one by one.
+int ExecuteFilesOneByOne(int argc, char **argv) {
+ for (int i = 1; i < argc; i++) {
+ std::ifstream in(argv[i]);
+ in.seekg(0, in.end);
+ size_t length = in.tellg();
+    in.seekg(0, in.beg);
+ std::cout << "Reading " << length << " bytes from " << argv[i] << std::endl;
+ // Allocate exactly length bytes so that we reliably catch buffer overflows.
+ std::vector<char> bytes(length);
+ in.read(bytes.data(), bytes.size());
+ assert(in);
+ LLVMFuzzerTestOneInput(reinterpret_cast<const uint8_t *>(bytes.data()),
+ bytes.size());
+ std::cout << "Execution successfull" << std::endl;
+ }
+ return 0;
+}
+
int main(int argc, char **argv) {
- fprintf(stderr, "======================= INFO =========================\n"
- "This binary is built for AFL-fuzz.\n"
- "To run the target function on a single input execute this:\n"
- " %s < INPUT_FILE\n"
- "To run the fuzzing execute this:\n"
- " afl-fuzz [afl-flags] %s [N] "
- "-- run N fuzzing iterations before "
- "re-spawning the process (default: 1000)\n"
- "======================================================\n",
- argv[0], argv[0]);
+ fprintf(stderr,
+ "======================= INFO =========================\n"
+ "This binary is built for AFL-fuzz.\n"
+ "To run the target function on individual input(s) execute this:\n"
+ " %s < INPUT_FILE\n"
+ "or\n"
+ " %s INPUT_FILE1 [INPUT_FILE2 ... ]\n"
+ "To fuzz with afl-fuzz execute this:\n"
+ " afl-fuzz [afl-flags] %s [-N]\n"
+ "afl-fuzz will run N iterations before "
+ "re-spawning the process (default: 1000)\n"
+ "======================================================\n",
+ argv[0], argv[0], argv[0]);
if (LLVMFuzzerInitialize)
LLVMFuzzerInitialize(&argc, &argv);
// Do any other expensive one-time initialization here.
@@ -266,8 +293,14 @@ int main(int argc, char **argv) {
__afl_manual_init();
int N = 1000;
- if (argc >= 2)
- N = atoi(argv[1]);
+ if (argc == 2 && argv[1][0] == '-')
+ N = atoi(argv[1] + 1);
+  else if (argc == 2 && (N = atoi(argv[1])) > 0)
+ fprintf(stderr, "WARNING: using the deprecated call style `%s %d`\n",
+ argv[0], N);
+ else if (argc > 1)
+    return ExecuteFilesOneByOne(argc, argv);
+
assert(N > 0);
time_t unit_time_secs;
int num_runs = 0;
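
The rewritten option handling in main distinguishes the new -N form, the
deprecated bare N form, and a list of input files. A reduced sketch of the
same dispatch with a hypothetical helper name:

    #include <cstdio>
    #include <cstdlib>

    // Returns the iteration count for persistent mode, or 0 when the
    // remaining arguments should be executed as input files instead.
    static int parseIterations(int argc, char **argv) {
      if (argc == 2 && argv[1][0] == '-')
        return std::atoi(argv[1] + 1); // new style: -N
      int N;
      if (argc == 2 && (N = std::atoi(argv[1])) > 0) {
        std::fprintf(stderr, "WARNING: deprecated call style\n");
        return N;                      // old style: bare N
      }
      return 0;                        // treat arguments as input files
    }

    int main(int argc, char **argv) {
      std::printf("iterations: %d\n", parseIterations(argc, argv));
    }
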
diff --git a/lib/Fuzzer/test/AFLDriverTest.cpp b/lib/Fuzzer/test/AFLDriverTest.cpp
index 3dd0b6117305..e3f5f7100883 100644
--- a/lib/Fuzzer/test/AFLDriverTest.cpp
+++ b/lib/Fuzzer/test/AFLDriverTest.cpp
@@ -4,19 +4,25 @@
// Contains dummy functions used to avoid dependency on AFL.
#include <stdint.h>
#include <stdlib.h>
+#include <stdio.h>
extern "C" void __afl_manual_init() {}
-extern "C" int __afl_persistent_loop(unsigned int) {
+extern "C" int __afl_persistent_loop(unsigned int N) {
+ static int Count = N;
+ fprintf(stderr, "__afl_persistent_loop calle, Count = %d\n", Count);
+ if (Count--) return 1;
return 0;
}
// This declaration exists to prevent the Darwin linker
// from complaining about this being a missing weak symbol.
extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) {
+ fprintf(stderr, "LLVMFuzzerInitialize called\n");
return 0;
}
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ fprintf(stderr, "LLVMFuzzerTestOneInput called; Size = %zd\n", Size);
return 0;
}
diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt
index cd049d3f03d8..b39938a705f6 100644
--- a/lib/Fuzzer/test/CMakeLists.txt
+++ b/lib/Fuzzer/test/CMakeLists.txt
@@ -104,6 +104,7 @@ set(Tests
OneHugeAllocTest
OutOfMemoryTest
OutOfMemorySingleLargeMallocTest
+ OverwriteInputTest
RepeatedMemcmp
RepeatedBytesTest
SimpleCmpTest
diff --git a/lib/Fuzzer/test/OverwriteInputTest.cpp b/lib/Fuzzer/test/OverwriteInputTest.cpp
new file mode 100644
index 000000000000..e688682346a6
--- /dev/null
+++ b/lib/Fuzzer/test/OverwriteInputTest.cpp
@@ -0,0 +1,13 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer. Make sure we abort if Data is overwritten.
+#include <cstdint>
+#include <cstddef>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size)
+ *const_cast<uint8_t*>(Data) = 1;
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/afl-driver.test b/lib/Fuzzer/test/afl-driver.test
new file mode 100644
index 000000000000..6eab23cc3636
--- /dev/null
+++ b/lib/Fuzzer/test/afl-driver.test
@@ -0,0 +1,26 @@
+REQUIRES: linux
+RUN: echo -n "abc" > %t.file3
+RUN: echo -n "abcd" > %t.file4
+
+RUN: AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefix=CHECK1
+CHECK1: __afl_persistent_loop called, Count = 1000
+CHECK1: LLVMFuzzerTestOneInput called; Size = 3
+
+
+RUN: AFLDriverTest < %t.file3 -42 2>&1 | FileCheck %s --check-prefix=CHECK2
+CHECK2: __afl_persistent_loop called, Count = 42
+CHECK2: LLVMFuzzerTestOneInput called; Size = 3
+
+
+RUN: AFLDriverTest < %t.file3 666 2>&1 | FileCheck %s --check-prefix=CHECK3
+CHECK3: WARNING: using the deprecated call style
+CHECK3: __afl_persistent_loop called, Count = 666
+CHECK3: LLVMFuzzerTestOneInput called; Size = 3
+
+
+RUN: AFLDriverTest %t.file3 2>&1 | FileCheck %s --check-prefix=CHECK4
+CHECK4: LLVMFuzzerTestOneInput called; Size = 3
+
+RUN: AFLDriverTest %t.file3 %t.file4 2>&1 | FileCheck %s --check-prefix=CHECK5
+CHECK5: LLVMFuzzerTestOneInput called; Size = 3
+CHECK5: LLVMFuzzerTestOneInput called; Size = 4
diff --git a/lib/Fuzzer/test/overwrite-input.test b/lib/Fuzzer/test/overwrite-input.test
new file mode 100644
index 000000000000..81c27909e8df
--- /dev/null
+++ b/lib/Fuzzer/test/overwrite-input.test
@@ -0,0 +1,2 @@
+RUN: not LLVMFuzzer-OverwriteInputTest 2>&1 | FileCheck %s
+CHECK: ERROR: libFuzzer: fuzz target overwrites its const input
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 4c6e3e3788bd..ec4663018bd4 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -805,6 +805,9 @@ void SlotTracker::processModule() {
if (!Var.hasName())
CreateModuleSlot(&Var);
processGlobalObjectMetadata(Var);
+ auto Attrs = Var.getAttributes();
+ if (Attrs.hasAttributes())
+ CreateAttributeSetSlot(Attrs);
}
for (const GlobalAlias &A : TheModule->aliases()) {
@@ -2502,6 +2505,10 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
GV->getAllMetadata(MDs);
printMetadataAttachments(MDs, ", ");
+ auto Attrs = GV->getAttributes();
+ if (Attrs.hasAttributes())
+ Out << " #" << Machine.getAttributeGroupSlot(Attrs);
+
printInfoComment(*GV);
}
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index cf2925254695..acfac316e91e 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -1,4 +1,4 @@
-//===-- AttributeImpl.h - Attribute Internals -------------------*- C++ -*-===//
+//===- AttributeImpl.h - Attribute Internals --------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,9 +21,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/Support/TrailingObjects.h"
-#include <algorithm>
#include <cassert>
-#include <climits>
#include <cstddef>
#include <cstdint>
#include <string>
@@ -80,11 +78,13 @@ public:
else
Profile(ID, getKindAsString(), getValueAsString());
}
+
static void Profile(FoldingSetNodeID &ID, Attribute::AttrKind Kind,
uint64_t Val) {
ID.AddInteger(Kind);
if (Val) ID.AddInteger(Val);
}
+
static void Profile(FoldingSetNodeID &ID, StringRef Kind, StringRef Values) {
ID.AddString(Kind);
if (!Values.empty()) ID.AddString(Values);
@@ -114,9 +114,10 @@ public:
};
class IntAttributeImpl : public EnumAttributeImpl {
- void anchor() override;
uint64_t Val;
+ void anchor() override;
+
public:
IntAttributeImpl(Attribute::AttrKind Kind, uint64_t Val)
: EnumAttributeImpl(IntAttrEntry, Kind), Val(Val) {
@@ -188,20 +189,22 @@ public:
std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
std::string getAsString(bool InAttrGrp) const;
- typedef const Attribute *iterator;
+ using iterator = const Attribute *;
+
iterator begin() const { return getTrailingObjects<Attribute>(); }
iterator end() const { return begin() + NumAttrs; }
void Profile(FoldingSetNodeID &ID) const {
Profile(ID, makeArrayRef(begin(), end()));
}
+
static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
for (const auto &Attr : AttrList)
Attr.Profile(ID);
}
};
-typedef std::pair<unsigned, AttributeSet> IndexAttrPair;
+using IndexAttrPair = std::pair<unsigned, AttributeSet>;
//===----------------------------------------------------------------------===//
/// \class
@@ -265,7 +268,8 @@ public:
return AvailableFunctionAttrs & ((uint64_t)1) << Kind;
}
- typedef AttributeSet::iterator iterator;
+ using iterator = AttributeSet::iterator;
+
iterator begin(unsigned Slot) const {
return getSlotAttributes(Slot).begin();
}
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index 3b1140ab542c..ce60367a6c8b 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -34,6 +34,8 @@
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
+#include <climits>
+#include <cstddef>
#include <cstdint>
#include <limits>
#include <map>
@@ -504,16 +506,74 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<Attribute> Attrs) {
return AttributeSet(AttributeSetNode::get(C, Attrs));
}
+AttributeSet AttributeSet::addAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const {
+ if (hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return addAttributes(C, AttributeSet::get(C, B));
+}
+
+AttributeSet AttributeSet::addAttribute(LLVMContext &C, StringRef Kind,
+ StringRef Value) const {
+ AttrBuilder B;
+ B.addAttribute(Kind, Value);
+ return addAttributes(C, AttributeSet::get(C, B));
+}
+
+AttributeSet AttributeSet::addAttributes(LLVMContext &C,
+ const AttributeSet AS) const {
+ if (!hasAttributes())
+ return AS;
+
+ if (!AS.hasAttributes())
+ return *this;
+
+ AttrBuilder B(AS);
+ for (Attribute I : *this)
+ B.addAttribute(I);
+
+ return get(C, B);
+}
+
+AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const {
+ if (!hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return removeAttributes(C, B);
+}
+
+AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
+ StringRef Kind) const {
+ if (!hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return removeAttributes(C, B);
+}
+
+AttributeSet AttributeSet::removeAttributes(LLVMContext &C,
+ const AttrBuilder &Attrs) const {
+
+  // FIXME: it is not obvious how this should work for alignment.
+ // For now, say we can't pass in alignment, which no current use does.
+ assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!");
+
+ AttrBuilder B(*this);
+ B.remove(Attrs);
+ return get(C, B);
+}
+
unsigned AttributeSet::getNumAttributes() const {
return SetNode ? SetNode->getNumAttributes() : 0;
}
bool AttributeSet::hasAttribute(Attribute::AttrKind Kind) const {
- return SetNode ? SetNode->hasAttribute(Kind) : 0;
+ return SetNode ? SetNode->hasAttribute(Kind) : false;
}
bool AttributeSet::hasAttribute(StringRef Kind) const {
- return SetNode ? SetNode->hasAttribute(Kind) : 0;
+ return SetNode ? SetNode->hasAttribute(Kind) : false;
}
Attribute AttributeSet::getAttribute(Attribute::AttrKind Kind) const {
@@ -557,6 +617,14 @@ AttributeSet::iterator AttributeSet::end() const {
return SetNode ? SetNode->end() : nullptr;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void AttributeSet::dump() const {
+ dbgs() << "AS =\n";
+ dbgs() << " { ";
+ dbgs() << getAsString(true) << " }\n";
+}
+#endif
+
//===----------------------------------------------------------------------===//
// AttributeSetNode Definition
//===----------------------------------------------------------------------===//
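
The new AttributeSet helpers are value-returning: each call funnels through
an AttrBuilder and constructs a fresh set rather than mutating in place. A
usage sketch against the API introduced above (assumes an existing
LLVMContext and the methods added in this patch; not a standalone program):

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/LLVMContext.h"

    void attrRoundTrip(llvm::LLVMContext &Ctx) {
      llvm::AttributeSet AS; // empty set
      AS = AS.addAttribute(Ctx, llvm::Attribute::NoUnwind); // returns a new set
      AS = AS.addAttribute(Ctx, "my-string-attr", "on");    // string attribute
      AS = AS.removeAttribute(Ctx, llvm::Attribute::NoUnwind);
      (void)AS.hasAttribute("my-string-attr"); // still present
    }
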
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index 80b117015ede..a20f3f811c8d 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -2041,9 +2041,6 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
Optional<unsigned> InRangeIndex,
ArrayRef<Value *> Idxs) {
if (Idxs.empty()) return C;
- Constant *Idx0 = cast<Constant>(Idxs[0]);
- if ((Idxs.size() == 1 && Idx0->isNullValue()))
- return C;
if (isa<UndefValue>(C)) {
Type *GEPTy = GetElementPtrInst::getGEPReturnType(
@@ -2051,10 +2048,15 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
return UndefValue::get(GEPTy);
}
+ Constant *Idx0 = cast<Constant>(Idxs[0]);
+ if (Idxs.size() == 1 && (Idx0->isNullValue() || isa<UndefValue>(Idx0)))
+ return C;
+
if (C->isNullValue()) {
bool isNull = true;
for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
- if (!cast<Constant>(Idxs[i])->isNullValue()) {
+ if (!isa<UndefValue>(Idxs[i]) &&
+ !cast<Constant>(Idxs[i])->isNullValue()) {
isNull = false;
break;
}
diff --git a/lib/IR/ConstantRange.cpp b/lib/IR/ConstantRange.cpp
index aeb1257754f3..509caba3acd4 100644
--- a/lib/IR/ConstantRange.cpp
+++ b/lib/IR/ConstantRange.cpp
@@ -278,7 +278,7 @@ APInt ConstantRange::getUnsignedMax() const {
}
APInt ConstantRange::getUnsignedMin() const {
- if (isFullSet() || (isWrappedSet() && getUpper() != 0))
+ if (isFullSet() || (isWrappedSet() && !getUpper().isNullValue()))
return APInt::getMinValue(getBitWidth());
return getLower();
}
@@ -442,7 +442,7 @@ ConstantRange ConstantRange::unionWith(const ConstantRange &CR) const {
APInt L = CR.Lower.ult(Lower) ? CR.Lower : Lower;
APInt U = (CR.Upper - 1).ugt(Upper - 1) ? CR.Upper : Upper;
- if (L == 0 && U == 0)
+ if (L.isNullValue() && U.isNullValue())
return ConstantRange(getBitWidth());
return ConstantRange(std::move(L), std::move(U));
@@ -757,7 +757,8 @@ ConstantRange::multiply(const ConstantRange &Other) const {
// from one positive number to another which is as good as we can generate.
// In this case, skip the extra work of generating signed ranges which aren't
// going to be better than this range.
- if (!UR.isWrappedSet() && UR.getLower().isNonNegative())
+ if (!UR.isWrappedSet() &&
+ (UR.getUpper().isNonNegative() || UR.getUpper().isMinSignedValue()))
return UR;
// Now the signed range. Because we could be dealing with negative numbers
@@ -834,7 +835,7 @@ ConstantRange::umin(const ConstantRange &Other) const {
ConstantRange
ConstantRange::udiv(const ConstantRange &RHS) const {
- if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax() == 0)
+ if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax().isNullValue())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
if (RHS.isFullSet())
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
@@ -842,7 +843,7 @@ ConstantRange::udiv(const ConstantRange &RHS) const {
APInt Lower = getUnsignedMin().udiv(RHS.getUnsignedMax());
APInt RHS_umin = RHS.getUnsignedMin();
- if (RHS_umin == 0) {
+ if (RHS_umin.isNullValue()) {
// We want the lowest value in RHS excluding zero. Usually that would be 1
// except for a range in the form of [X, 1) in which case it would be X.
if (RHS.getUpper() == 1)
@@ -892,29 +893,33 @@ ConstantRange::shl(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
- APInt min = getUnsignedMin().shl(Other.getUnsignedMin());
- APInt max = getUnsignedMax().shl(Other.getUnsignedMax());
+ APInt max = getUnsignedMax();
+ APInt Other_umax = Other.getUnsignedMax();
- // there's no overflow!
- APInt Zeros(getBitWidth(), getUnsignedMax().countLeadingZeros());
- if (Zeros.ugt(Other.getUnsignedMax()))
- return ConstantRange(std::move(min), std::move(max) + 1);
+ // there's overflow!
+ if (Other_umax.uge(max.countLeadingZeros()))
+ return ConstantRange(getBitWidth(), /*isFullSet=*/true);
// FIXME: implement the other tricky cases
- return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+ APInt min = getUnsignedMin();
+ min <<= Other.getUnsignedMin();
+ max <<= Other_umax;
+
+ return ConstantRange(std::move(min), std::move(max) + 1);
}
ConstantRange
ConstantRange::lshr(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
-
- APInt max = getUnsignedMax().lshr(Other.getUnsignedMin());
+
+ APInt max = getUnsignedMax().lshr(Other.getUnsignedMin()) + 1;
APInt min = getUnsignedMin().lshr(Other.getUnsignedMax());
- if (min == max + 1)
+ if (min == max)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- return ConstantRange(std::move(min), std::move(max) + 1);
+ return ConstantRange(std::move(min), std::move(max));
}
ConstantRange ConstantRange::inverse() const {
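
The reworked shl first asks, via countLeadingZeros on the unsigned maximum,
whether any permitted shift amount could wrap; only when none can does it
build [min << a, max << b + 1). A self-contained sketch of the guard on
64-bit values (a portable stand-in for APInt):

    #include <cassert>
    #include <cstdint>

    static unsigned countLeadingZeros64(uint64_t X) {
      unsigned N = 0;
      for (uint64_t Bit = uint64_t(1) << 63; Bit && !(X & Bit); Bit >>= 1)
        ++N;
      return N;
    }

    // Conservative guard matching the patch: once Shift >= clz(Max), the
    // shifted upper bound (plus one for the exclusive end) may wrap, so the
    // range must widen to the full set.
    static bool shlOverflows(uint64_t Max, uint64_t Shift) {
      return Shift >= countLeadingZeros64(Max);
    }

    int main() {
      assert(!shlOverflows(0xFF, 8)); // 8 < 56 leading zeros: exact result
      assert(shlOverflows(0xFF, 60)); // high bits would be shifted out
    }
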
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index ffc8f2e4303b..4b9d89cda539 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -30,7 +30,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
-#include <cstdarg>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -966,16 +966,6 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) {
return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V);
}
-Constant *ConstantStruct::get(StructType *T, ...) {
- va_list ap;
- SmallVector<Constant*, 8> Values;
- va_start(ap, T);
- while (Constant *Val = va_arg(ap, llvm::Constant*))
- Values.push_back(Val);
- va_end(ap);
- return get(T, Values);
-}
-
ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V)
: ConstantAggregate(T, ConstantVectorVal, V) {
assert(V.size() == T->getNumElements() &&
@@ -1810,8 +1800,7 @@ Constant *ConstantExpr::getSizeOf(Type* Ty) {
Constant *ConstantExpr::getAlignOf(Type* Ty) {
// alignof is implemented as: (i64) gep ({i1,Ty}*)null, 0, 1
// Note that a non-inbounds gep is used, as null isn't within any object.
- Type *AligningTy =
- StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, nullptr);
+ Type *AligningTy = StructType::get(Type::getInt1Ty(Ty->getContext()), Ty);
Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo(0));
Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0);
Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1);
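// Editor's note: several hunks in this import, like the getAlignOf change
// above, drop a trailing nullptr from StructType::get. The sentinel-terminated
// varargs overloads are deleted later in this patch (see Type.cpp), leaving
// overloads that take the element types directly. A minimal sketch:
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

StructType *pairOfInt1And(Type *Ty) {
  LLVMContext &Ctx = Ty->getContext();
  // Old style (removed): StructType::get(Type::getInt1Ty(Ctx), Ty, nullptr);
  return StructType::get(Type::getInt1Ty(Ctx), Ty);
}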
diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h
index eda751d8af4a..25eb9452d9d0 100644
--- a/lib/IR/ConstantsContext.h
+++ b/lib/IR/ConstantsContext.h
@@ -22,6 +22,7 @@
#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
@@ -387,31 +388,34 @@ struct ConstantExprKeyType;
template <class ConstantClass> struct ConstantInfo;
template <> struct ConstantInfo<ConstantExpr> {
- typedef ConstantExprKeyType ValType;
- typedef Type TypeClass;
+ using ValType = ConstantExprKeyType;
+ using TypeClass = Type;
};
template <> struct ConstantInfo<InlineAsm> {
- typedef InlineAsmKeyType ValType;
- typedef PointerType TypeClass;
+ using ValType = InlineAsmKeyType;
+ using TypeClass = PointerType;
};
template <> struct ConstantInfo<ConstantArray> {
- typedef ConstantAggrKeyType<ConstantArray> ValType;
- typedef ArrayType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantArray>;
+ using TypeClass = ArrayType;
};
template <> struct ConstantInfo<ConstantStruct> {
- typedef ConstantAggrKeyType<ConstantStruct> ValType;
- typedef StructType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantStruct>;
+ using TypeClass = StructType;
};
template <> struct ConstantInfo<ConstantVector> {
- typedef ConstantAggrKeyType<ConstantVector> ValType;
- typedef VectorType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantVector>;
+ using TypeClass = VectorType;
};
template <class ConstantClass> struct ConstantAggrKeyType {
ArrayRef<Constant *> Operands;
+
ConstantAggrKeyType(ArrayRef<Constant *> Operands) : Operands(Operands) {}
+
ConstantAggrKeyType(ArrayRef<Constant *> Operands, const ConstantClass *)
: Operands(Operands) {}
+
ConstantAggrKeyType(const ConstantClass *C,
SmallVectorImpl<Constant *> &Storage) {
assert(Storage.empty() && "Expected empty storage");
@@ -437,7 +441,8 @@ template <class ConstantClass> struct ConstantAggrKeyType {
return hash_combine_range(Operands.begin(), Operands.end());
}
- typedef typename ConstantInfo<ConstantClass>::TypeClass TypeClass;
+ using TypeClass = typename ConstantInfo<ConstantClass>::TypeClass;
+
ConstantClass *create(TypeClass *Ty) const {
return new (Operands.size()) ConstantClass(Ty, Operands);
}
@@ -457,6 +462,7 @@ struct InlineAsmKeyType {
: AsmString(AsmString), Constraints(Constraints), FTy(FTy),
HasSideEffects(HasSideEffects), IsAlignStack(IsAlignStack),
AsmDialect(AsmDialect) {}
+
InlineAsmKeyType(const InlineAsm *Asm, SmallVectorImpl<Constant *> &)
: AsmString(Asm->getAsmString()), Constraints(Asm->getConstraintString()),
FTy(Asm->getFunctionType()), HasSideEffects(Asm->hasSideEffects()),
@@ -483,7 +489,8 @@ struct InlineAsmKeyType {
AsmDialect, FTy);
}
- typedef ConstantInfo<InlineAsm>::TypeClass TypeClass;
+ using TypeClass = ConstantInfo<InlineAsm>::TypeClass;
+
InlineAsm *create(TypeClass *Ty) const {
assert(PointerType::getUnqual(FTy) == Ty);
return new InlineAsm(FTy, AsmString, Constraints, HasSideEffects,
@@ -507,11 +514,13 @@ struct ConstantExprKeyType {
: Opcode(Opcode), SubclassOptionalData(SubclassOptionalData),
SubclassData(SubclassData), Ops(Ops), Indexes(Indexes),
ExplicitTy(ExplicitTy) {}
+
ConstantExprKeyType(ArrayRef<Constant *> Operands, const ConstantExpr *CE)
: Opcode(CE->getOpcode()),
SubclassOptionalData(CE->getRawSubclassOptionalData()),
SubclassData(CE->isCompare() ? CE->getPredicate() : 0), Ops(Operands),
Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef<unsigned>()) {}
+
ConstantExprKeyType(const ConstantExpr *CE,
SmallVectorImpl<Constant *> &Storage)
: Opcode(CE->getOpcode()),
@@ -553,7 +562,8 @@ struct ConstantExprKeyType {
hash_combine_range(Indexes.begin(), Indexes.end()));
}
- typedef ConstantInfo<ConstantExpr>::TypeClass TypeClass;
+ using TypeClass = ConstantInfo<ConstantExpr>::TypeClass;
+
ConstantExpr *create(TypeClass *Ty) const {
switch (Opcode) {
default:
@@ -594,16 +604,17 @@ struct ConstantExprKeyType {
template <class ConstantClass> class ConstantUniqueMap {
public:
- typedef typename ConstantInfo<ConstantClass>::ValType ValType;
- typedef typename ConstantInfo<ConstantClass>::TypeClass TypeClass;
- typedef std::pair<TypeClass *, ValType> LookupKey;
+ using ValType = typename ConstantInfo<ConstantClass>::ValType;
+ using TypeClass = typename ConstantInfo<ConstantClass>::TypeClass;
+ using LookupKey = std::pair<TypeClass *, ValType>;
/// Key and hash together, so that we compute the hash only once and reuse it.
- typedef std::pair<unsigned, LookupKey> LookupKeyHashed;
+ using LookupKeyHashed = std::pair<unsigned, LookupKey>;
private:
struct MapInfo {
- typedef DenseMapInfo<ConstantClass *> ConstantClassInfo;
+ using ConstantClassInfo = DenseMapInfo<ConstantClass *>;
+
static inline ConstantClass *getEmptyKey() {
return ConstantClassInfo::getEmptyKey();
}
@@ -643,7 +654,7 @@ private:
};
public:
- typedef DenseSet<ConstantClass *, MapInfo> MapTy;
+ using MapTy = DenseSet<ConstantClass *, MapInfo>;
private:
MapTy Map;
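// Editor's note: the typedef churn above is mechanical modernization to C++11
// alias declarations. A self-contained illustration with hypothetical names;
// the two spellings mean the same thing, but only the alias form can be
// templated.
#include <utility>
#include <vector>

using ValType = std::pair<unsigned, int>;     // reads left to right
typedef std::pair<unsigned, int> OldValType;  // same meaning, older spelling

template <class T> using VecOf = std::vector<T>; // no typedef equivalent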
diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp
index cdbe237766a3..e6c49cad0722 100644
--- a/lib/IR/DebugInfoMetadata.cpp
+++ b/lib/IR/DebugInfoMetadata.cpp
@@ -672,6 +672,24 @@ void DIExpression::appendOffset(SmallVectorImpl<uint64_t> &Ops,
}
}
+bool DIExpression::extractIfOffset(int64_t &Offset) const {
+ if (getNumElements() == 0) {
+ Offset = 0;
+ return true;
+ }
+ if (getNumElements() != 2)
+ return false;
+ if (Elements[0] == dwarf::DW_OP_plus) {
+ Offset = Elements[1];
+ return true;
+ }
+ if (Elements[0] == dwarf::DW_OP_minus) {
+ Offset = -Elements[1];
+ return true;
+ }
+ return false;
+}
+
DIExpression *DIExpression::prepend(const DIExpression *Expr, bool Deref,
int64_t Offset, bool StackValue) {
SmallVector<uint64_t, 8> Ops;
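// Editor's sketch of how the new DIExpression::extractIfOffset above behaves:
// an empty expression is offset 0, and the only two-element forms recognized
// are {DW_OP_plus, N} (offset N) and {DW_OP_minus, N} (offset -N).
#include "llvm/IR/DebugInfoMetadata.h"
#include <cstdint>
using namespace llvm;

bool readOffset(const DIExpression *Expr, int64_t &Off) {
  // True for !DIExpression(DW_OP_plus, 16) with Off == 16 and for
  // !DIExpression() with Off == 0; false for any richer expression.
  return Expr->extractIfOffset(Off);
}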
diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp
index f31074a7ad44..3168ec6944a3 100644
--- a/lib/IR/DebugLoc.cpp
+++ b/lib/IR/DebugLoc.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "LLVMContextImpl.h"
#include "llvm/IR/DebugInfo.h"
using namespace llvm;
@@ -66,6 +67,119 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, const MDNode *Scope,
const_cast<MDNode *>(InlinedAt));
}
+DebugLoc DebugLoc::appendInlinedAt(DebugLoc DL, DILocation *InlinedAt,
+ LLVMContext &Ctx,
+ DenseMap<const MDNode *, MDNode *> &Cache,
+ bool ReplaceLast) {
+ SmallVector<DILocation *, 3> InlinedAtLocations;
+ DILocation *Last = InlinedAt;
+ DILocation *CurInlinedAt = DL;
+
+ // Gather all the inlined-at nodes.
+ while (DILocation *IA = CurInlinedAt->getInlinedAt()) {
+ // Skip any we've already built nodes for.
+ if (auto *Found = Cache[IA]) {
+ Last = cast<DILocation>(Found);
+ break;
+ }
+
+ if (ReplaceLast && !IA->getInlinedAt())
+ break;
+ InlinedAtLocations.push_back(IA);
+ CurInlinedAt = IA;
+ }
+
+ // Starting from the top, rebuild the nodes to point to the new inlined-at
+ // location (then rebuilding the rest of the chain behind it) and update the
+ // map of already-constructed inlined-at nodes.
+ for (const DILocation *MD : reverse(InlinedAtLocations))
+ Cache[MD] = Last = DILocation::getDistinct(
+ Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
+
+ return Last;
+}
+
+/// Reparent \c Scope from \c OrigSP to \c NewSP.
+static DIScope *reparentScope(LLVMContext &Ctx, DIScope *Scope,
+ DISubprogram *OrigSP, DISubprogram *NewSP,
+ DenseMap<const MDNode *, MDNode *> &Cache) {
+ SmallVector<DIScope *, 3> ScopeChain;
+ DIScope *Last = NewSP;
+ DIScope *CurScope = Scope;
+ do {
+ if (auto *SP = dyn_cast<DISubprogram>(CurScope)) {
+ // Don't rewrite this scope chain if it doesn't lead to the replaced SP.
+ if (SP != OrigSP)
+ return Scope;
+ Cache.insert({OrigSP, NewSP});
+ break;
+ }
+ if (auto *Found = Cache[CurScope]) {
+ Last = cast<DIScope>(Found);
+ break;
+ }
+ ScopeChain.push_back(CurScope);
+ } while ((CurScope = CurScope->getScope().resolve()));
+
+  // Starting from the top, rebuild the scope chain to point into the new
+  // subprogram (then rebuilding the rest of the chain behind it) and update
+  // the map of already-constructed scopes.
+ for (const DIScope *MD : reverse(ScopeChain)) {
+ if (auto *LB = dyn_cast<DILexicalBlock>(MD))
+ Cache[MD] = Last = DILexicalBlock::getDistinct(
+ Ctx, Last, LB->getFile(), LB->getLine(), LB->getColumn());
+ else if (auto *LB = dyn_cast<DILexicalBlockFile>(MD))
+ Cache[MD] = Last = DILexicalBlockFile::getDistinct(
+ Ctx, Last, LB->getFile(), LB->getDiscriminator());
+ else
+ llvm_unreachable("illegal parent scope");
+ }
+ return Last;
+}
+
+void DebugLoc::reparentDebugInfo(Instruction &I, DISubprogram *OrigSP,
+ DISubprogram *NewSP,
+ DenseMap<const MDNode *, MDNode *> &Cache) {
+ auto DL = I.getDebugLoc();
+ if (!OrigSP || !NewSP || OrigSP == NewSP || !DL)
+ return;
+
+ // Reparent the debug location.
+ auto &Ctx = I.getContext();
+ DILocation *InlinedAt = DL->getInlinedAt();
+ if (InlinedAt) {
+ while (auto *IA = InlinedAt->getInlinedAt())
+ InlinedAt = IA;
+ auto NewScope =
+ reparentScope(Ctx, InlinedAt->getScope(), OrigSP, NewSP, Cache);
+ InlinedAt =
+ DebugLoc::get(InlinedAt->getLine(), InlinedAt->getColumn(), NewScope);
+ }
+ I.setDebugLoc(
+ DebugLoc::get(DL.getLine(), DL.getCol(),
+ reparentScope(Ctx, DL->getScope(), OrigSP, NewSP, Cache),
+ DebugLoc::appendInlinedAt(DL, InlinedAt, Ctx, Cache,
+ ReplaceLastInlinedAt)));
+
+ // Fix up debug variables to point to NewSP.
+ auto reparentVar = [&](DILocalVariable *Var) {
+ return DILocalVariable::getDistinct(
+ Ctx,
+ cast<DILocalScope>(
+ reparentScope(Ctx, Var->getScope(), OrigSP, NewSP, Cache)),
+ Var->getName(), Var->getFile(), Var->getLine(), Var->getType(),
+ Var->getArg(), Var->getFlags(), Var->getAlignInBits());
+ };
+ if (auto *DbgValue = dyn_cast<DbgValueInst>(&I)) {
+ auto *Var = DbgValue->getVariable();
+ I.setOperand(2, MetadataAsValue::get(Ctx, reparentVar(Var)));
+ } else if (auto *DbgDeclare = dyn_cast<DbgDeclareInst>(&I)) {
+ auto *Var = DbgDeclare->getVariable();
+ I.setOperand(1, MetadataAsValue::get(Ctx, reparentVar(Var)));
+ }
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void DebugLoc::dump() const {
if (!Loc)
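// Editor's sketch (hypothetical usage, not from the patch): after cloning a
// function body whose locations reference OrigSP, reparentDebugInfo above can
// rewrite every instruction's scope chain onto NewSP, which keeps the new
// "DISubprogram attached to more than one function" Verifier check happy.
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
using namespace llvm;

void reparentAll(Function &NewF, DISubprogram *OrigSP, DISubprogram *NewSP) {
  // One cache for the whole walk, so shared scopes are rebuilt only once.
  DenseMap<const MDNode *, MDNode *> Cache;
  for (BasicBlock &BB : NewF)
    for (Instruction &I : BB)
      DebugLoc::reparentDebugInfo(I, OrigSP, NewSP, Cache);
}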
diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp
index 395b6158e0c8..e73f53f3202d 100644
--- a/lib/IR/DiagnosticInfo.cpp
+++ b/lib/IR/DiagnosticInfo.cpp
@@ -12,20 +12,31 @@
// Diagnostics reporting is still done as part of the LLVMContext.
//===----------------------------------------------------------------------===//
-#include "llvm/IR/DiagnosticInfo.h"
-#include "LLVMContextImpl.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Regex.h"
#include <atomic>
+#include <cassert>
+#include <memory>
#include <string>
using namespace llvm;
@@ -53,6 +64,8 @@ struct PassRemarksOpt {
}
};
+} // end anonymous namespace
+
static PassRemarksOpt PassRemarksOptLoc;
static PassRemarksOpt PassRemarksMissedOptLoc;
static PassRemarksOpt PassRemarksAnalysisOptLoc;
@@ -85,7 +98,6 @@ PassRemarksAnalysis(
"the given regular expression"),
cl::Hidden, cl::location(PassRemarksAnalysisOptLoc), cl::ValueRequired,
cl::ZeroOrMore);
-}
int llvm::getNextAvailablePluginDiagnosticKind() {
static std::atomic<int> PluginKindID(DK_FirstPluginKind);
@@ -97,8 +109,7 @@ const char *OptimizationRemarkAnalysis::AlwaysPrint = "";
DiagnosticInfoInlineAsm::DiagnosticInfoInlineAsm(const Instruction &I,
const Twine &MsgStr,
DiagnosticSeverity Severity)
- : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(0), MsgStr(MsgStr),
- Instr(&I) {
+ : DiagnosticInfo(DK_InlineAsm, Severity), MsgStr(MsgStr), Instr(&I) {
if (const MDNode *SrcLoc = I.getMetadata("srcloc")) {
if (SrcLoc->getNumOperands() != 0)
if (const auto *CI =
@@ -193,7 +204,7 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V
// Only include names that correspond to user variables. FIXME: we should use
// debug info if available to get the name of the user variable.
if (isa<llvm::Argument>(V) || isa<GlobalValue>(V))
- Val = GlobalValue::getRealLinkageName(V->getName());
+ Val = GlobalValue::dropLLVMManglingEscape(V->getName());
else if (isa<Constant>(V)) {
raw_string_ostream OS(Val);
V->printAsOperand(OS, /*PrintType=*/false);
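// Editor's note on the rename used above (getRealLinkageName became
// GlobalValue::dropLLVMManglingEscape): IR names that begin with the '\1'
// escape byte opt out of datalayout mangling, and this helper strips that
// byte before a name is shown to users. A minimal sketch:
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/GlobalValue.h"
using namespace llvm;

StringRef userVisibleName(StringRef IRName) {
  // "\x01_foo" -> "_foo"; names without the escape come back unchanged.
  return GlobalValue::dropLLVMManglingEscape(IRName);
}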
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 58c060550322..16a9e51b8306 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -1,4 +1,4 @@
-//===-- Function.cpp - Implement the Global object classes ----------------===//
+//===- Function.cpp - Implement the Global object classes -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,21 +11,51 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Function.h"
#include "LLVMContextImpl.h"
#include "SymbolTableListTraitsImpl.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/SymbolTableListTraits.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <string>
+
using namespace llvm;
// Explicit instantiations of SymbolTableListTraits since some of the methods
@@ -36,7 +66,7 @@ template class llvm::SymbolTableListTraits<BasicBlock>;
// Argument Implementation
//===----------------------------------------------------------------------===//
-void Argument::anchor() { }
+void Argument::anchor() {}
Argument::Argument(Type *Ty, const Twine &Name, Function *Par, unsigned ArgNo)
: Value(Ty, Value::ArgumentVal), Parent(Par), ArgNo(ArgNo) {
@@ -186,7 +216,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name,
Module *ParentModule)
: GlobalObject(Ty, Value::FunctionVal,
OperandTraits<Function>::op_begin(this), 0, Linkage, name),
- Arguments(nullptr), NumArgs(Ty->getNumParams()) {
+ NumArgs(Ty->getNumParams()) {
assert(FunctionType::isValidReturnType(getReturnType()) &&
"invalid return type");
setGlobalObjectSubClassData(0);
@@ -386,24 +416,20 @@ void Function::clearGC() {
/// Copy all additional attributes (those not needed to create a Function) from
/// the Function Src to this one.
-void Function::copyAttributesFrom(const GlobalValue *Src) {
+void Function::copyAttributesFrom(const Function *Src) {
GlobalObject::copyAttributesFrom(Src);
- const Function *SrcF = dyn_cast<Function>(Src);
- if (!SrcF)
- return;
-
- setCallingConv(SrcF->getCallingConv());
- setAttributes(SrcF->getAttributes());
- if (SrcF->hasGC())
- setGC(SrcF->getGC());
+ setCallingConv(Src->getCallingConv());
+ setAttributes(Src->getAttributes());
+ if (Src->hasGC())
+ setGC(Src->getGC());
else
clearGC();
- if (SrcF->hasPersonalityFn())
- setPersonalityFn(SrcF->getPersonalityFn());
- if (SrcF->hasPrefixData())
- setPrefixData(SrcF->getPrefixData());
- if (SrcF->hasPrologueData())
- setPrologueData(SrcF->getPrologueData());
+ if (Src->hasPersonalityFn())
+ setPersonalityFn(Src->getPersonalityFn());
+ if (Src->hasPrefixData())
+ setPrefixData(Src->getPrefixData());
+ if (Src->hasPrologueData())
+ setPrologueData(Src->getPrologueData());
}
/// Table of string intrinsic names indexed by enum value.
@@ -486,10 +512,10 @@ void Function::recalculateIntrinsicID() {
static std::string getMangledTypeStr(Type* Ty) {
std::string Result;
if (PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
- Result += "p" + llvm::utostr(PTyp->getAddressSpace()) +
+ Result += "p" + utostr(PTyp->getAddressSpace()) +
getMangledTypeStr(PTyp->getElementType());
} else if (ArrayType* ATyp = dyn_cast<ArrayType>(Ty)) {
- Result += "a" + llvm::utostr(ATyp->getNumElements()) +
+ Result += "a" + utostr(ATyp->getNumElements()) +
getMangledTypeStr(ATyp->getElementType());
} else if (StructType *STyp = dyn_cast<StructType>(Ty)) {
if (!STyp->isLiteral()) {
@@ -534,7 +560,6 @@ std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) {
return Result;
}
-
/// IIT_Info - These are enumerators that describe the entries returned by the
/// getIntrinsicInfoTableEntries function.
///
@@ -585,9 +610,10 @@ enum IIT_Info {
static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
SmallVectorImpl<Intrinsic::IITDescriptor> &OutputTable) {
+ using namespace Intrinsic;
+
IIT_Info Info = IIT_Info(Infos[NextElt++]);
unsigned StructElts = 2;
- using namespace Intrinsic;
switch (Info) {
case IIT_Done:
@@ -742,7 +768,6 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
llvm_unreachable("unhandled");
}
-
#define GET_INTRINSIC_GENERATOR_GLOBAL
#include "llvm/IR/Intrinsics.gen"
#undef GET_INTRINSIC_GENERATOR_GLOBAL
@@ -780,10 +805,10 @@ void Intrinsic::getIntrinsicInfoTableEntries(ID id,
DecodeIITType(NextElt, IITEntries, T);
}
-
static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
ArrayRef<Type*> Tys, LLVMContext &Context) {
using namespace Intrinsic;
+
IITDescriptor D = Infos.front();
Infos = Infos.slice(1);
@@ -855,12 +880,10 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::VecOfAnyPtrsToElt:
// Return the overloaded type (which determines the pointers address space)
return Tys[D.getOverloadArgNumber()];
- }
+ }
llvm_unreachable("unhandled");
}
-
-
FunctionType *Intrinsic::getType(LLVMContext &Context,
ID id, ArrayRef<Type*> Tys) {
SmallVector<IITDescriptor, 8> Table;
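// Editor's note: the getMangledTypeStr hunks above only drop a redundant
// llvm:: qualifier; the mangling scheme itself is unchanged. A sketch of how
// it surfaces in overloaded intrinsic names (pointers mangle as
// p<addrspace><elt>, integers as i<N>):
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include <string>
using namespace llvm;

std::string memcpyName(LLVMContext &Ctx) {
  Type *I8Ptr = Type::getInt8PtrTy(Ctx);
  Type *I64 = Type::getInt64Ty(Ctx);
  // Expected result: "llvm.memcpy.p0i8.p0i8.i64"
  return Intrinsic::getName(Intrinsic::memcpy, {I8Ptr, I8Ptr, I64});
}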
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 5f338f58d940..17d27b016cf2 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -69,6 +69,30 @@ void GlobalValue::copyAttributesFrom(const GlobalValue *Src) {
setDLLStorageClass(Src->getDLLStorageClass());
}
+void GlobalValue::removeFromParent() {
+ switch (getValueID()) {
+#define HANDLE_GLOBAL_VALUE(NAME) \
+ case Value::NAME##Val: \
+ return static_cast<NAME *>(this)->removeFromParent();
+#include "llvm/IR/Value.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a global");
+}
+
+void GlobalValue::eraseFromParent() {
+ switch (getValueID()) {
+#define HANDLE_GLOBAL_VALUE(NAME) \
+ case Value::NAME##Val: \
+ return static_cast<NAME *>(this)->eraseFromParent();
+#include "llvm/IR/Value.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a global");
+}
+
unsigned GlobalValue::getAlignment() const {
if (auto *GA = dyn_cast<GlobalAlias>(this)) {
// In general we cannot compute this at the IR level, but we try.
@@ -93,12 +117,10 @@ void GlobalObject::setAlignment(unsigned Align) {
assert(getAlignment() == Align && "Alignment representation error!");
}
-void GlobalObject::copyAttributesFrom(const GlobalValue *Src) {
+void GlobalObject::copyAttributesFrom(const GlobalObject *Src) {
GlobalValue::copyAttributesFrom(Src);
- if (const auto *GV = dyn_cast<GlobalObject>(Src)) {
- setAlignment(GV->getAlignment());
- setSection(GV->getSection());
- }
+ setAlignment(Src->getAlignment());
+ setSection(Src->getSection());
}
std::string GlobalValue::getGlobalIdentifier(StringRef Name,
@@ -233,7 +255,7 @@ bool GlobalValue::canIncreaseAlignment() const {
const GlobalObject *GlobalValue::getBaseObject() const {
if (auto *GO = dyn_cast<GlobalObject>(this))
return GO;
- if (auto *GA = dyn_cast<GlobalAlias>(this))
+ if (auto *GA = dyn_cast<GlobalIndirectSymbol>(this))
return GA->getBaseObject();
return nullptr;
}
@@ -333,12 +355,11 @@ void GlobalVariable::setInitializer(Constant *InitVal) {
/// Copy all additional attributes (those not needed to create a GlobalVariable)
/// from the GlobalVariable Src to this one.
-void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) {
+void GlobalVariable::copyAttributesFrom(const GlobalVariable *Src) {
GlobalObject::copyAttributesFrom(Src);
- if (const GlobalVariable *SrcVar = dyn_cast<GlobalVariable>(Src)) {
- setThreadLocalMode(SrcVar->getThreadLocalMode());
- setExternallyInitialized(SrcVar->isExternallyInitialized());
- }
+ setThreadLocalMode(Src->getThreadLocalMode());
+ setExternallyInitialized(Src->isExternallyInitialized());
+ setAttributes(Src->getAttributes());
}
void GlobalVariable::dropAllReferences() {
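// Editor's note: the new removeFromParent/eraseFromParent above use the same
// .def-file trick as the TerminatorInst changes later in this patch: a switch
// over the value kind whose cases are stamped out by a macro include. A
// self-contained analogue with hypothetical types:
#include <cassert>

struct Shape { int Kind; };
struct Circle : Shape { void draw() {} };
struct Square : Shape { void draw() {} };

// In LLVM the case list comes from an #include of Value.def; here it is
// written out inline to keep the sketch self-contained.
#define HANDLE_SHAPE(NAME, ID)                                                 \
  case ID:                                                                     \
    return static_cast<NAME *>(S)->draw();

void draw(Shape *S) {
  switch (S->Kind) {
    HANDLE_SHAPE(Circle, 0)
    HANDLE_SHAPE(Square, 1)
  default:
    break;
  }
  assert(false && "not a drawable shape");
}
#undef HANDLE_SHAPE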
diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp
index e265a823687f..3477c087967f 100644
--- a/lib/IR/IRBuilder.cpp
+++ b/lib/IR/IRBuilder.cpp
@@ -161,6 +161,94 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
return CI;
}
+static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID,
+ Value *Src) {
+ Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Src};
+ Type *Tys[] = { Src->getType()->getVectorElementType(), Src->getType() };
+ auto Decl = Intrinsic::getDeclaration(M, ID, Tys);
+ return createCallHelper(Decl, Ops, Builder);
+}
+
+CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
+ Module *M = GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Acc, Src};
+ Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
+ Src->getType()};
+ auto Decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_vector_reduce_fadd, Tys);
+ return createCallHelper(Decl, Ops, this);
+}
+
+CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
+ Module *M = GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Acc, Src};
+ Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
+ Src->getType()};
+ auto Decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_vector_reduce_fmul, Tys);
+ return createCallHelper(Decl, Ops, this);
+}
+
+CallInst *IRBuilderBase::CreateAddReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateMulReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateAndReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateOrReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateXorReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) {
+ auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax
+ : Intrinsic::experimental_vector_reduce_umax;
+ return getReductionIntrinsic(this, ID, Src);
+}
+
+CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) {
+ auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smin
+ : Intrinsic::experimental_vector_reduce_umin;
+ return getReductionIntrinsic(this, ID, Src);
+}
+
+CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) {
+ auto Rdx = getReductionIntrinsic(
+ this, Intrinsic::experimental_vector_reduce_fmax, Src);
+ if (NoNaN) {
+ FastMathFlags FMF;
+ FMF.setNoNaNs();
+ Rdx->setFastMathFlags(FMF);
+ }
+ return Rdx;
+}
+
+CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) {
+ auto Rdx = getReductionIntrinsic(
+ this, Intrinsic::experimental_vector_reduce_fmin, Src);
+ if (NoNaN) {
+ FastMathFlags FMF;
+ FMF.setNoNaNs();
+ Rdx->setFastMathFlags(FMF);
+ }
+ return Rdx;
+}
+
CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
assert(isa<PointerType>(Ptr->getType()) &&
"lifetime.start only applies to pointers.");
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 906a28a5c887..91b9d9232b54 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -534,6 +534,30 @@ bool Instruction::isAtomic() const {
}
}
+bool Instruction::hasAtomicLoad() const {
+ assert(isAtomic());
+ switch (getOpcode()) {
+ default:
+ return false;
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
+ case Instruction::Load:
+ return true;
+ }
+}
+
+bool Instruction::hasAtomicStore() const {
+ assert(isAtomic());
+ switch (getOpcode()) {
+ default:
+ return false;
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
+ case Instruction::Store:
+ return true;
+ }
+}
+
bool Instruction::mayThrow() const {
if (const CallInst *CI = dyn_cast<CallInst>(this))
return !CI->doesNotThrow();
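// Editor's sketch for the new queries above. Both assert isAtomic(), so a
// caller filters first; note that cmpxchg and atomicrmw report true for both
// the load and the store side.
#include "llvm/IR/Instruction.h"
using namespace llvm;

bool readsAtomically(const Instruction &I) {
  return I.isAtomic() && I.hasAtomicLoad();
}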
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index a60cc375d568..5a5b9c0d06bb 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -1,4 +1,4 @@
-//===-- Instructions.cpp - Implement the LLVM instructions ----------------===//
+//===- Instructions.cpp - Implement the LLVM instructions -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,18 +12,36 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Instructions.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
-#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -42,7 +60,42 @@ User::op_iterator CallSite::getCallee() const {
//===----------------------------------------------------------------------===//
// Out of line virtual method, so the vtable, etc has a home.
-TerminatorInst::~TerminatorInst() {
+TerminatorInst::~TerminatorInst() = default;
+
+unsigned TerminatorInst::getNumSuccessors() const {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<const CLASS *>(this)->getNumSuccessorsV();
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
+}
+
+BasicBlock *TerminatorInst::getSuccessor(unsigned idx) const {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<const CLASS *>(this)->getSuccessorV(idx);
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
+}
+
+void TerminatorInst::setSuccessor(unsigned idx, BasicBlock *B) {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<CLASS *>(this)->setSuccessorV(idx, B);
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
}
//===----------------------------------------------------------------------===//
@@ -50,8 +103,7 @@ TerminatorInst::~TerminatorInst() {
//===----------------------------------------------------------------------===//
// Out of line virtual method, so the vtable, etc has a home.
-UnaryInstruction::~UnaryInstruction() {
-}
+UnaryInstruction::~UnaryInstruction() = default;
//===----------------------------------------------------------------------===//
// SelectInst Class
@@ -82,7 +134,6 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
return nullptr;
}
-
//===----------------------------------------------------------------------===//
// PHINode Class
//===----------------------------------------------------------------------===//
@@ -242,8 +293,7 @@ void LandingPadInst::addClause(Constant *Val) {
// CallInst Implementation
//===----------------------------------------------------------------------===//
-CallInst::~CallInst() {
-}
+CallInst::~CallInst() = default;
void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) {
@@ -541,7 +591,6 @@ Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
ArraySize, OpB, MallocF, Name);
}
-
/// CreateMalloc - Generate the IR for a call to malloc:
/// 1. Compute the malloc call's argument as the specified type's size,
/// possibly multiplied by the array size if the array size is not
@@ -692,9 +741,11 @@ InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef<OperandBundleDef> OpB,
BasicBlock *InvokeInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned InvokeInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
return setSuccessor(idx, B);
}
@@ -821,6 +872,7 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, Instruction *InsertBefore)
if (retVal)
Op<0>() = retVal;
}
+
ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd)
: TerminatorInst(Type::getVoidTy(C), Instruction::Ret,
OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
@@ -828,6 +880,7 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd)
if (retVal)
Op<0>() = retVal;
}
+
ReturnInst::ReturnInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
: TerminatorInst(Type::getVoidTy(Context), Instruction::Ret,
OperandTraits<ReturnInst>::op_end(this), 0, InsertAtEnd) {
@@ -847,8 +900,7 @@ BasicBlock *ReturnInst::getSuccessorV(unsigned idx) const {
llvm_unreachable("ReturnInst has no successors!");
}
-ReturnInst::~ReturnInst() {
-}
+ReturnInst::~ReturnInst() = default;
//===----------------------------------------------------------------------===//
// ResumeInst Implementation
@@ -930,9 +982,11 @@ BasicBlock *CleanupReturnInst::getSuccessorV(unsigned Idx) const {
assert(Idx == 0);
return getUnwindDest();
}
+
unsigned CleanupReturnInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void CleanupReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) {
assert(Idx == 0);
setUnwindDest(B);
@@ -973,9 +1027,11 @@ BasicBlock *CatchReturnInst::getSuccessorV(unsigned Idx) const {
assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!");
return getSuccessor();
}
+
unsigned CatchReturnInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void CatchReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) {
assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!");
setSuccessor(B);
@@ -1067,9 +1123,11 @@ void CatchSwitchInst::removeHandler(handler_iterator HI) {
BasicBlock *CatchSwitchInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned CatchSwitchInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void CatchSwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
@@ -1155,6 +1213,7 @@ BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore)
assert(IfTrue && "Branch destination may not be null!");
Op<-1>() = IfTrue;
}
+
BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
Instruction *InsertBefore)
: TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
@@ -1189,7 +1248,6 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
#endif
}
-
BranchInst::BranchInst(const BranchInst &BI) :
TerminatorInst(Type::getVoidTy(BI.getContext()), Instruction::Br,
OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
@@ -1216,14 +1274,15 @@ void BranchInst::swapSuccessors() {
BasicBlock *BranchInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned BranchInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void BranchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
-
//===----------------------------------------------------------------------===//
// AllocaInst Implementation
//===----------------------------------------------------------------------===//
@@ -1279,8 +1338,7 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
}
// Out of line virtual method, so the vtable, etc has a home.
-AllocaInst::~AllocaInst() {
-}
+AllocaInst::~AllocaInst() = default;
void AllocaInst::setAlignment(unsigned Align) {
assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
@@ -1543,8 +1601,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
SynchronizationScope SynchScope,
Instruction *InsertBefore)
: Instruction(
- StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()),
- nullptr),
+ StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
OperandTraits<AtomicCmpXchgInst>::operands(this), InsertBefore) {
Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope);
@@ -1556,8 +1613,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
SynchronizationScope SynchScope,
BasicBlock *InsertAtEnd)
: Instruction(
- StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()),
- nullptr),
+ StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
OperandTraits<AtomicCmpXchgInst>::operands(this), InsertAtEnd) {
Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope);
@@ -1771,14 +1827,12 @@ ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
setName(Name);
}
-
bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) {
if (!Val->getType()->isVectorTy() || !Index->getType()->isIntegerTy())
return false;
return true;
}
-
//===----------------------------------------------------------------------===//
// InsertElementInst Implementation
//===----------------------------------------------------------------------===//
@@ -1825,7 +1879,6 @@ bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt,
return true;
}
-
//===----------------------------------------------------------------------===//
// ShuffleVectorInst Implementation
//===----------------------------------------------------------------------===//
@@ -1938,7 +1991,6 @@ void ShuffleVectorInst::getShuffleMask(Constant *Mask,
}
}
-
//===----------------------------------------------------------------------===//
// InsertValueInst Class
//===----------------------------------------------------------------------===//
@@ -1951,7 +2003,7 @@ void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
// (other than weirdness with &*IdxBegin being invalid; see
// getelementptr's init routine for example). But there's no
// present need to support it.
- assert(Idxs.size() > 0 && "InsertValueInst must have at least one index");
+ assert(!Idxs.empty() && "InsertValueInst must have at least one index");
assert(ExtractValueInst::getIndexedType(Agg->getType(), Idxs) ==
Val->getType() && "Inserted value must match indexed type!");
@@ -1980,7 +2032,7 @@ void ExtractValueInst::init(ArrayRef<unsigned> Idxs, const Twine &Name) {
// There's no fundamental reason why we require at least one index.
// But there's no present need to support it.
- assert(Idxs.size() > 0 && "ExtractValueInst must have at least one index");
+ assert(!Idxs.empty() && "ExtractValueInst must have at least one index");
Indices.append(Idxs.begin(), Idxs.end());
setName(Name);
@@ -2053,7 +2105,6 @@ BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
setName(Name);
}
-
void BinaryOperator::init(BinaryOps iType) {
Value *LHS = getOperand(0), *RHS = getOperand(1);
(void)LHS; (void)RHS; // Silence warnings.
@@ -2213,7 +2264,6 @@ BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
Op->getType(), Name, InsertAtEnd);
}
-
// isConstantAllOnes - Helper function for several functions below
static inline bool isConstantAllOnes(const Value *V) {
if (const Constant *C = dyn_cast<Constant>(V))
@@ -2279,7 +2329,6 @@ const Value *BinaryOperator::getNotArgument(const Value *BinOp) {
return getNotArgument(const_cast<Value*>(BinOp));
}
-
// Exchange the two operands to this instruction. This instruction is safe to
// use on any binary instruction and does not modify the semantics of the
// instruction. If the instruction is order-dependent (SetLT f.e.), the opcode
@@ -2291,7 +2340,6 @@ bool BinaryOperator::swapOperands() {
return false;
}
-
//===----------------------------------------------------------------------===//
// FPMathOperator Class
//===----------------------------------------------------------------------===//
@@ -2305,7 +2353,6 @@ float FPMathOperator::getFPAccuracy() const {
return Accuracy->getValueAPF().convertToFloat();
}
-
//===----------------------------------------------------------------------===//
// CastInst Class
//===----------------------------------------------------------------------===//
@@ -2567,13 +2614,12 @@ unsigned CastInst::isEliminableCastPair(
return Instruction::BitCast;
return 0;
}
- case 12: {
+ case 12:
// addrspacecast, addrspacecast -> bitcast, if SrcAS == DstAS
// addrspacecast, addrspacecast -> addrspacecast, if SrcAS != DstAS
if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace())
return Instruction::AddrSpaceCast;
return Instruction::BitCast;
- }
case 13:
// FIXME: this state can be merged with (1), but the following assert
    // is useful to check the correctness of the sequence due to semantic
@@ -2594,7 +2640,6 @@ unsigned CastInst::isEliminableCastPair(
DstTy->getScalarType()->getPointerElementType())
return Instruction::AddrSpaceCast;
return 0;
-
case 15:
// FIXME: this state can be merged with (1), but the following assert
    // is useful to check the correctness of the sequence due to semantic
@@ -3070,7 +3115,6 @@ CastInst::getCastOpcode(
/// of the types involved.
bool
CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
-
// Check for type sanity on the arguments
Type *SrcTy = S->getType();
@@ -3419,7 +3463,6 @@ bool CmpInst::isEquality() const {
return cast<FCmpInst>(this)->isEquality();
}
-
CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) {
switch (pred) {
default: llvm_unreachable("Unknown cmp predicate!");
@@ -3743,9 +3786,11 @@ void SwitchInst::growOperands() {
BasicBlock *SwitchInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned SwitchInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void SwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
@@ -3832,9 +3877,11 @@ void IndirectBrInst::removeDestination(unsigned idx) {
BasicBlock *IndirectBrInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
+
unsigned IndirectBrInst::getNumSuccessorsV() const {
return getNumSuccessors();
}
+
void IndirectBrInst::setSuccessorV(unsigned idx, BasicBlock *B) {
setSuccessor(idx, B);
}
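// Editor's note: the TerminatorInst changes above replace per-subclass virtual
// successor accessors with one out-of-line switch stamped out from
// Instruction.def, so existing call sites are unchanged. A typical traversal
// still looks like this sketch:
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

bool branchesTo(const TerminatorInst *TI, const BasicBlock *BB) {
  // getNumSuccessors/getSuccessor now dispatch on getOpcode() internally.
  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
    if (TI->getSuccessor(i) == BB)
      return true;
  return false;
}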
diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp
index 628a67bd639c..b2b12289f871 100644
--- a/lib/IR/LegacyPassManager.cpp
+++ b/lib/IR/LegacyPassManager.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManagers.h"
@@ -465,6 +466,11 @@ public:
// null. It may be called multiple times.
static void createTheTimeInfo();
+ // print - Prints out timing information and then resets the timers.
+ void print() {
+ TG.print(*CreateInfoOutputFile());
+ }
+
/// getPassTimer - Return the timer for the specified pass if it exists.
Timer *getPassTimer(Pass *P) {
if (P->getAsPMDataManager())
@@ -1752,6 +1758,13 @@ Timer *llvm::getPassTimer(Pass *P) {
return nullptr;
}
+/// If timing is enabled, report the times collected up to now and then reset
+/// them.
+void llvm::reportAndResetTimings() {
+ if (TheTimeInfo)
+ TheTimeInfo->print();
+}
+
//===----------------------------------------------------------------------===//
// PMStack implementation
//
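// Editor's sketch for the new llvm::reportAndResetTimings above (the header
// declaring it is an assumption here): a driver that runs several
// pass-manager rounds, as the LTO code generators below do, can flush
// -time-passes data after each round instead of only at process exit.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
using namespace llvm;

void runAndReport(legacy::PassManager &PM, Module &M) {
  PM.run(M);
  reportAndResetTimings(); // no-op unless pass timing was enabled
}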
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index fec9df193685..12c258d95f52 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -1,4 +1,4 @@
-//===-- Module.cpp - Implement the Module class ---------------------------===//
+//===- Module.cpp - Implement the Module class ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,27 +11,46 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Module.h"
#include "SymbolTableListTraitsImpl.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/GVMaterializer.h"
-#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/SymbolTableListTraits.h"
+#include "llvm/IR/Type.h"
#include "llvm/IR/TypeFinder.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/RandomNumberGenerator.h"
#include <algorithm>
-#include <cstdarg>
-#include <cstdlib>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
using namespace llvm;
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index b67b0a307861..c9f957c244f8 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -1,4 +1,4 @@
-//===-- Type.cpp - Implement the Type class -------------------------------===//
+//===- Type.cpp - Implement the Type class --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,12 +11,25 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Type.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include <algorithm>
-#include <cstdarg>
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -220,7 +233,6 @@ PointerType *Type::getInt64PtrTy(LLVMContext &C, unsigned AS) {
return getInt64Ty(C)->getPointerTo(AS);
}
-
//===----------------------------------------------------------------------===//
// IntegerType Implementation
//===----------------------------------------------------------------------===//
@@ -362,7 +374,8 @@ void StructType::setName(StringRef Name) {
if (Name == getName()) return;
StringMap<StructType *> &SymbolTable = getContext().pImpl->NamedStructTypes;
- typedef StringMap<StructType *>::MapEntryTy EntryTy;
+
+ using EntryTy = StringMap<StructType *>::MapEntryTy;
// If this struct already had a name, remove its symbol table entry. Don't
// delete the data yet because it may be part of the new name.
@@ -419,21 +432,6 @@ StructType *StructType::get(LLVMContext &Context, bool isPacked) {
return get(Context, None, isPacked);
}
-StructType *StructType::get(Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- LLVMContext &Ctx = type->getContext();
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- auto *Ret = llvm::StructType::get(Ctx, StructFields);
- va_end(ap);
- return Ret;
-}
-
StructType *StructType::create(LLVMContext &Context, ArrayRef<Type*> Elements,
StringRef Name, bool isPacked) {
StructType *ST = create(Context, Name);
@@ -462,21 +460,6 @@ StructType *StructType::create(ArrayRef<Type*> Elements) {
return create(Elements[0]->getContext(), Elements, StringRef());
}
-StructType *StructType::create(StringRef Name, Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- LLVMContext &Ctx = type->getContext();
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- auto *Ret = llvm::StructType::create(Ctx, StructFields, Name);
- va_end(ap);
- return Ret;
-}
-
bool StructType::isSized(SmallPtrSetImpl<Type*> *Visited) const {
if ((getSubclassData() & SCDB_IsSized) != 0)
return true;
@@ -508,19 +491,6 @@ StringRef StructType::getName() const {
return ((StringMapEntry<StructType*> *)SymbolTableEntry)->getKey();
}
-void StructType::setBody(Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- setBody(StructFields);
- va_end(ap);
-}
-
bool StructType::isValidElementType(Type *ElemTy) {
return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
!ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() &&
@@ -540,7 +510,6 @@ StructType *Module::getTypeByName(StringRef Name) const {
return getContext().pImpl->NamedStructTypes.lookup(Name);
}
-
//===----------------------------------------------------------------------===//
// CompositeType Implementation
//===----------------------------------------------------------------------===//
@@ -589,7 +558,6 @@ bool CompositeType::indexValid(unsigned Idx) const {
return true;
}
-
//===----------------------------------------------------------------------===//
// ArrayType Implementation
//===----------------------------------------------------------------------===//
@@ -661,7 +629,6 @@ PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) {
return Entry;
}
-
PointerType::PointerType(Type *E, unsigned AddrSpace)
: Type(E->getContext(), PointerTyID), PointeeTy(E) {
ContainedTys = &PointeeTy;
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 65e124562493..3b68d6365872 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -267,6 +267,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
/// \brief Keep track of the metadata nodes that have been checked already.
SmallPtrSet<const Metadata *, 32> MDNodes;
+  /// Keep track of which DISubprogram is attached to which function.
+ DenseMap<const DISubprogram *, const Function *> DISubprogramAttachments;
+
/// Track all DICompileUnits visited.
SmallPtrSet<const Metadata *, 2> CUVisited;
@@ -386,7 +389,7 @@ public:
verifyCompileUnits();
verifyDeoptimizeCallingConvs();
-
+ DISubprogramAttachments.clear();
return !Broken;
}
@@ -2085,13 +2088,19 @@ void Verifier::visitFunction(const Function &F) {
switch (I.first) {
default:
break;
- case LLVMContext::MD_dbg:
+ case LLVMContext::MD_dbg: {
++NumDebugAttachments;
AssertDI(NumDebugAttachments == 1,
"function must have a single !dbg attachment", &F, I.second);
AssertDI(isa<DISubprogram>(I.second),
"function !dbg attachment must be a subprogram", &F, I.second);
+ auto *SP = cast<DISubprogram>(I.second);
+ const Function *&AttachedTo = DISubprogramAttachments[SP];
+ AssertDI(!AttachedTo || AttachedTo == &F,
+ "DISubprogram attached to more than one function", SP, &F);
+ AttachedTo = &F;
break;
+ }
case LLVMContext::MD_prof:
++NumProfAttachments;
Assert(NumProfAttachments == 1,
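// Editor's note: the Verifier change above enforces that a DISubprogram is
// attached to at most one function, via the "remember the first owner" map
// idiom. A self-contained analogue with hypothetical types:
#include <cassert>
#include <map>

void recordOwner(std::map<int, const char *> &Owners, int SubprogramId,
                 const char *Fn) {
  // operator[] default-inserts nullptr, so the first caller becomes the owner
  // and any later caller with a different Fn trips the assertion.
  const char *&AttachedTo = Owners[SubprogramId];
  assert((!AttachedTo || AttachedTo == Fn) &&
         "subprogram attached to more than one function");
  AttachedTo = Fn;
}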
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index 684b378c93e5..89ddd0fc1af3 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -24,7 +24,6 @@ subdirectories =
DebugInfo
Demangle
ExecutionEngine
- LibDriver
LineEditor
Linker
IR
@@ -39,6 +38,7 @@ subdirectories =
Support
TableGen
Target
+ ToolDrivers
Transforms
[component_0]
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 2d2dcdec05fb..c73b6b6b15c1 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -973,7 +973,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
// this value. If not, no need to preserve any ThinLTO copies.
!Res.second.IRName.empty())
GUIDPreservedSymbols.insert(GlobalValue::getGUID(
- GlobalValue::getRealLinkageName(Res.second.IRName)));
+ GlobalValue::dropLLVMManglingEscape(Res.second.IRName)));
}
auto DeadSymbols =
@@ -993,7 +993,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
if (Res.second.IRName.empty())
continue;
auto GUID = GlobalValue::getGUID(
- GlobalValue::getRealLinkageName(Res.second.IRName));
+ GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
// Mark exported unless index-based analysis determined it to be dead.
if (!DeadSymbols.count(GUID))
ExportedGUIDs.insert(GUID);
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 86fba843e980..6a275560dc92 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -495,17 +495,14 @@ void LTOCodeGenerator::verifyMergedModuleOnce() {
return;
HasVerifiedInput = true;
- if (LTOStripInvalidDebugInfo) {
- bool BrokenDebugInfo = false;
- if (verifyModule(*MergedModule, &dbgs(), &BrokenDebugInfo))
- report_fatal_error("Broken module found, compilation aborted!");
- if (BrokenDebugInfo) {
- emitWarning("Invalid debug info found, debug info will be stripped");
- StripDebugInfo(*MergedModule);
- }
- }
- if (verifyModule(*MergedModule, &dbgs()))
+ bool BrokenDebugInfo = false;
+ if (verifyModule(*MergedModule, &dbgs(),
+ LTOStripInvalidDebugInfo ? &BrokenDebugInfo : nullptr))
report_fatal_error("Broken module found, compilation aborted!");
+ if (BrokenDebugInfo) {
+ emitWarning("Invalid debug info found, debug info will be stripped");
+ StripDebugInfo(*MergedModule);
+ }
}
void LTOCodeGenerator::finishOptimizationRemarks() {
@@ -600,6 +597,7 @@ bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> Out) {
// If statistics were requested, print them out after codegen.
if (llvm::AreStatisticsEnabled())
llvm::PrintStatistics();
+ reportAndResetTimings();
finishOptimizationRemarks();
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
index b4ee7c2b2fbc..65a7994325bc 100644
--- a/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -446,7 +446,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
{
raw_svector_ostream OS(OutputBuffer);
ProfileSummaryInfo PSI(TheModule);
- auto Index = buildModuleSummaryIndex(TheModule, nullptr, nullptr);
+ auto Index = buildModuleSummaryIndex(TheModule, nullptr, &PSI);
WriteBitcodeToFile(&TheModule, OS, true, &Index);
}
return make_unique<ObjectMemoryBuffer>(std::move(OutputBuffer));
@@ -1024,4 +1024,5 @@ void ThinLTOCodeGenerator::run() {
// If statistics were requested, print them out now.
if (llvm::AreStatisticsEnabled())
llvm::PrintStatistics();
+ reportAndResetTimings();
}
diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp
index 15a46a2d0420..ecef1efda1a2 100644
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@@ -602,6 +602,7 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
/*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
SGVar->getType()->getAddressSpace());
NewDGV->setAlignment(SGVar->getAlignment());
+ NewDGV->copyAttributesFrom(SGVar);
return NewDGV;
}
@@ -610,8 +611,11 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
Function *IRLinker::copyFunctionProto(const Function *SF) {
// If there is no linkage to be performed or we are linking from the source,
// bring SF over.
- return Function::Create(TypeMap.get(SF->getFunctionType()),
- GlobalValue::ExternalLinkage, SF->getName(), &DstM);
+ auto *F =
+ Function::Create(TypeMap.get(SF->getFunctionType()),
+ GlobalValue::ExternalLinkage, SF->getName(), &DstM);
+ F->copyAttributesFrom(SF);
+ return F;
}
/// Set up prototypes for any aliases that come over from the source module.
@@ -619,9 +623,11 @@ GlobalValue *IRLinker::copyGlobalAliasProto(const GlobalAlias *SGA) {
// If there is no linkage to be performed or we're linking from the source,
// bring over SGA.
auto *Ty = TypeMap.get(SGA->getValueType());
- return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
- GlobalValue::ExternalLinkage, SGA->getName(),
- &DstM);
+ auto *GA =
+ GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
+ GlobalValue::ExternalLinkage, SGA->getName(), &DstM);
+ GA->copyAttributesFrom(SGA);
+ return GA;
}
GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
@@ -648,8 +654,6 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
else if (SGV->hasExternalWeakLinkage())
NewGV->setLinkage(GlobalValue::ExternalWeakLinkage);
- NewGV->copyAttributesFrom(SGV);
-
if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
// Metadata for global variables and function declarations is copied eagerly.
if (isa<GlobalVariable>(SGV) || SGV->isDeclaration())
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index f7f2253256eb..174397e27396 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -133,6 +133,11 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
// Avoid fixups when possible.
int64_t AbsValue;
if (Value->evaluateAsAbsolute(AbsValue, getAssembler())) {
+ if (!isUIntN(8 * Size, AbsValue) && !isIntN(8 * Size, AbsValue)) {
+ getContext().reportError(
+ Loc, "value evaluated as " + Twine(AbsValue) + " is out of range.");
+ return;
+ }
EmitIntValue(AbsValue, Size);
return;
}
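// Editor's sketch of the predicates behind the new range check above, from
// llvm/Support/MathExtras.h: a value is emittable when it fits the field as
// either an unsigned or a signed N-bit integer.
#include "llvm/Support/MathExtras.h"
#include <cstdint>

bool fitsInField(int64_t Value, unsigned SizeInBytes) {
  unsigned Bits = 8 * SizeInBytes;
  // For a 1-byte field: 255 fits as unsigned, -128 as signed, 300 as neither.
  return llvm::isUIntN(Bits, Value) || llvm::isIntN(Bits, Value);
}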
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 66ba853da2fe..3b213ef4ce09 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -288,6 +288,7 @@ public:
private:
bool isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc);
+ void altMacroString(StringRef AltMacroStr, std::string &Res);
bool parseStatement(ParseStatementInfo &Info,
MCAsmParserSemaCallback *SI);
bool parseCurlyBlockScope(SmallVectorImpl<AsmRewrite>& AsmStrRewrites);
@@ -1209,6 +1210,8 @@ bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
const char *CharPtr = StrLoc.getPointer();
while ((*CharPtr != '>') && (*CharPtr != '\n') &&
(*CharPtr != '\r') && (*CharPtr != '\0')){
+ if (*CharPtr == '!')
+ CharPtr++;
CharPtr++;
}
if (*CharPtr == '>') {
@@ -1218,6 +1221,15 @@ bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
return false;
}
+/// \brief Creates a copy of the string with the '!' escape characters removed.
+void AsmParser::altMacroString(StringRef AltMacroStr, std::string &Res) {
+ for (size_t Pos = 0; Pos < AltMacroStr.size(); Pos++) {
+ if (AltMacroStr[Pos] == '!')
+ Pos++;
+ Res += AltMacroStr[Pos];
+ }
+}
+
/// \brief Parse an expression and return it.
///
/// expr ::= expr &&,|| expr -> lowest.
@@ -2309,6 +2321,15 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
(*(Token.getString().begin()) == '%') && Token.is(AsmToken::Integer))
// Emit an integer value to the buffer.
OS << Token.getIntVal();
+ // Only a token that was validated as a string and begins with '<'
+ // is considered an altmacro string.
+ else if ((Lexer.IsaAltMacroMode()) &&
+ (*(Token.getString().begin()) == '<') &&
+ Token.is(AsmToken::String)) {
+ std::string Res;
+ altMacroString(Token.getStringContents(), Res);
+ OS << Res;
+ }
// We expect no quotes around the string's contents when
// parsing for varargs.
else if (Token.isNot(AsmToken::String) || VarargParameter)
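In altmacro mode, '!' escapes the character that follows it, and the new altMacroString() helper copies the contents while dropping the escapes. A standalone sketch of the same transformation, with a bounds guard added for a trailing '!' (the in-tree version assumes the lexer never produces one):

  #include <string>

  static std::string stripAltmacroEscapes(const std::string &S) {
    std::string Res;
    for (size_t Pos = 0; Pos < S.size(); ++Pos) {
      if (S[Pos] == '!')   // skip the escape character itself...
        ++Pos;             // ...and emit the character it protects
      if (Pos < S.size())
        Res += S[Pos];
    }
    return Res;
  }

  // stripAltmacroEscapes("a!>b") == "a>b"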
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index b1223e81be43..28531feccfe1 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -1062,7 +1062,7 @@ COFFObjectFile::getSectionContents(const coff_section *Sec,
// In COFF, a virtual section won't have any in-file
// content, so the file pointer to the content will be zero.
if (Sec->PointerToRawData == 0)
- return object_error::parse_failed;
+ return std::error_code();
// The only thing that we need to verify is that the contents is contained
// within the file bounds. We don't need to make sure it doesn't cover other
// data, as there's nothing that says that is not allowed.
@@ -1602,8 +1602,6 @@ ErrorOr<ArrayRef<UTF16>> ResourceSectionRef::getDirStringAtOffset(uint32_t Offse
uint16_t Length;
RETURN_IF_ERROR(Reader.readInteger(Length));
ArrayRef<UTF16> RawDirString;
- // Strings are stored as 2-byte aligned unicode characters but readFixedString
- // assumes byte string, so we double length.
RETURN_IF_ERROR(Reader.readArray(RawDirString, Length));
return RawDirString;
}
diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp
index 39f8704aacf2..058686e4db9e 100644
--- a/lib/Object/WasmObjectFile.cpp
+++ b/lib/Object/WasmObjectFile.cpp
@@ -168,6 +168,13 @@ static wasm::WasmLimits readLimits(const uint8_t *&Ptr) {
return Result;
}
+static wasm::WasmTable readTable(const uint8_t *&Ptr) {
+ wasm::WasmTable Table;
+ Table.ElemType = readVarint7(Ptr);
+ Table.Limits = readLimits(Ptr);
+ return Table;
+}
+
static Error readSection(WasmSection &Section, const uint8_t *&Ptr,
const uint8_t *Start) {
// TODO(sbc): Avoid reading past EOF in the case of malformed files.
@@ -397,13 +404,22 @@ Error WasmObjectFile::parseImportSection(const uint8_t *Ptr, const uint8_t *End)
Sections.size(), i);
break;
case wasm::WASM_EXTERNAL_GLOBAL:
- Im.GlobalType = readVarint7(Ptr);
- Im.GlobalMutable = readVaruint1(Ptr);
+ Im.Global.Type = readVarint7(Ptr);
+ Im.Global.Mutable = readVaruint1(Ptr);
Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT,
Sections.size(), i);
break;
+ case wasm::WASM_EXTERNAL_MEMORY:
+ Im.Memory = readLimits(Ptr);
+ break;
+ case wasm::WASM_EXTERNAL_TABLE:
+ Im.Table = readTable(Ptr);
+ if (Im.Table.ElemType != wasm::WASM_TYPE_ANYFUNC) {
+ return make_error<GenericBinaryError>("Invalid table element type",
+ object_error::parse_failed);
+ }
+ break;
default:
- // TODO(sbc): Handle other kinds of imports
return make_error<GenericBinaryError>(
"Unexpected import kind", object_error::parse_failed);
}
@@ -431,14 +447,11 @@ Error WasmObjectFile::parseTableSection(const uint8_t *Ptr, const uint8_t *End)
uint32_t Count = readVaruint32(Ptr);
Tables.reserve(Count);
while (Count--) {
- wasm::WasmTable Table;
- Table.ElemType = readVarint7(Ptr);
- if (Table.ElemType != wasm::WASM_TYPE_ANYFUNC) {
+ Tables.push_back(readTable(Ptr));
+ if (Tables.back().ElemType != wasm::WASM_TYPE_ANYFUNC) {
return make_error<GenericBinaryError>("Invalid table element type",
object_error::parse_failed);
}
- Table.Limits = readLimits(Ptr);
- Tables.push_back(Table);
}
if (Ptr != End)
return make_error<GenericBinaryError>("Table section ended prematurely",
@@ -493,8 +506,10 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End)
Symbols.emplace_back(Ex.Name, WasmSymbol::SymbolType::GLOBAL_EXPORT,
Sections.size(), i);
break;
+ case wasm::WASM_EXTERNAL_MEMORY:
+ case wasm::WASM_EXTERNAL_TABLE:
+ break;
default:
- // TODO(sbc): Handle other kinds of exports
return make_error<GenericBinaryError>(
"Unexpected export kind", object_error::parse_failed);
}
@@ -507,7 +522,7 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End)
Error WasmObjectFile::parseStartSection(const uint8_t *Ptr, const uint8_t *End) {
StartFunction = readVaruint32(Ptr);
- if (StartFunction < FunctionTypes.size())
+ if (StartFunction >= FunctionTypes.size())
return make_error<GenericBinaryError>("Invalid start function",
object_error::parse_failed);
return Error::success();
@@ -638,10 +653,14 @@ basic_symbol_iterator WasmObjectFile::symbol_end() const {
return BasicSymbolRef(Ref, this);
}
-const WasmSymbol &WasmObjectFile::getWasmSymbol(DataRefImpl Symb) const {
+const WasmSymbol &WasmObjectFile::getWasmSymbol(const DataRefImpl &Symb) const {
return Symbols[Symb.d.a];
}
+const WasmSymbol &WasmObjectFile::getWasmSymbol(const SymbolRef &Symb) const {
+ return getWasmSymbol(Symb.getRawDataRefImpl());
+}
+
Expected<StringRef> WasmObjectFile::getSymbolName(DataRefImpl Symb) const {
const WasmSymbol &Sym = getWasmSymbol(Symb);
return Sym.Name;
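The readVaruint32/readVarint7 helpers used throughout this file are not shown in these hunks; they decode LEB128, the variable-length integer encoding wasm uses. A sketch of an unsigned LEB128 decoder in the same advancing-pointer style (no bounds checking, matching the TODO above about reading past EOF):

  #include <cstdint>

  static uint32_t decodeULEB128(const uint8_t *&Ptr) {
    uint32_t Result = 0;
    unsigned Shift = 0;
    uint8_t Byte;
    do {
      Byte = *Ptr++;                            // consume one byte
      Result |= uint32_t(Byte & 0x7f) << Shift; // low 7 bits are payload
      Shift += 7;
    } while (Byte & 0x80);                      // high bit set: more follow
    return Result;
  }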
diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp
index c5d1b438ee2a..910d32f16af9 100644
--- a/lib/ObjectYAML/WasmYAML.cpp
+++ b/lib/ObjectYAML/WasmYAML.cpp
@@ -265,8 +265,12 @@ void MappingTraits<WasmYAML::Import>::mapping(IO &IO,
if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
IO.mapRequired("SigIndex", Import.SigIndex);
} else if (Import.Kind == wasm::WASM_EXTERNAL_GLOBAL) {
- IO.mapRequired("GlobalType", Import.GlobalType);
- IO.mapRequired("GlobalMutable", Import.GlobalMutable);
+ IO.mapRequired("GlobalType", Import.GlobalImport.Type);
+ IO.mapRequired("GlobalMutable", Import.GlobalImport.Mutable);
+ } else if (Import.Kind == wasm::WASM_EXTERNAL_TABLE) {
+ IO.mapRequired("Table", Import.TableImport);
+ } else if (Import.Kind == wasm::WASM_EXTERNAL_MEMORY) {
+ IO.mapRequired("Memory", Import.Memory);
} else {
llvm_unreachable("unhandled import type");
}
diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp
index b91b6fb7c7ad..b05efa7417b9 100644
--- a/lib/ProfileData/SampleProfWriter.cpp
+++ b/lib/ProfileData/SampleProfWriter.cpp
@@ -29,6 +29,7 @@
#include <algorithm>
#include <cstdint>
#include <memory>
+#include <set>
#include <system_error>
#include <utility>
#include <vector>
@@ -36,6 +37,32 @@
using namespace llvm;
using namespace sampleprof;
+std::error_code
+SampleProfileWriter::write(const StringMap<FunctionSamples> &ProfileMap) {
+ if (std::error_code EC = writeHeader(ProfileMap))
+ return EC;
+
+ // Sort the ProfileMap by descending total sample count.
+ typedef std::pair<StringRef, const FunctionSamples *> NameFunctionSamples;
+ std::vector<NameFunctionSamples> V;
+ for (const auto &I : ProfileMap)
+ V.push_back(std::make_pair(I.getKey(), &I.second));
+
+ std::stable_sort(
+ V.begin(), V.end(),
+ [](const NameFunctionSamples &A, const NameFunctionSamples &B) {
+ if (A.second->getTotalSamples() == B.second->getTotalSamples())
+ return A.first > B.first;
+ return A.second->getTotalSamples() > B.second->getTotalSamples();
+ });
+
+ for (const auto &I : V) {
+ if (std::error_code EC = write(*I.second))
+ return EC;
+ }
+ return sampleprof_error::success;
+}
+
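The comparator above orders functions by descending total sample count, breaking ties by name so the output is fully deterministic. A toy illustration of the same ordering:

  #include <algorithm>
  #include <cstdint>
  #include <string>
  #include <utility>
  #include <vector>

  int main() {
    typedef std::pair<std::string, uint64_t> Entry;
    std::vector<Entry> V = {{"foo", 10}, {"bar", 30}, {"baz", 10}};
    std::stable_sort(V.begin(), V.end(),
                     [](const Entry &A, const Entry &B) {
                       if (A.second == B.second)
                         return A.first > B.first; // tie-break on name
                       return A.second > B.second; // more samples first
                     });
    // Order is now bar(30), foo(10), baz(10): "foo" > "baz".
    return 0;
  }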
/// \brief Write samples to a text file.
///
/// Note: it may be tempting to implement this in terms of
@@ -97,8 +124,7 @@ std::error_code SampleProfileWriterBinary::writeNameIdx(StringRef FName) {
}
void SampleProfileWriterBinary::addName(StringRef FName) {
- auto NextIdx = NameTable.size();
- NameTable.insert(std::make_pair(FName, NextIdx));
+ NameTable.insert(std::make_pair(FName, 0));
}
void SampleProfileWriterBinary::addNames(const FunctionSamples &S) {
@@ -136,10 +162,18 @@ std::error_code SampleProfileWriterBinary::writeHeader(
addNames(I.second);
}
+ // Sort the names so that the NameTable is deterministic.
+ std::set<StringRef> V;
+ for (const auto &I : NameTable)
+ V.insert(I.first);
+ int i = 0;
+ for (const StringRef &N : V)
+ NameTable[N] = i++;
+
// Write out the name table.
encodeULEB128(NameTable.size(), OS);
- for (auto N : NameTable) {
- OS << N.first;
+ for (auto N : V) {
+ OS << N;
encodeULEB128(0, OS);
}
return sampleprof_error::success;
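The same determinism concern drives the NameTable change above: indices are assigned only after the keys have been collected into a sorted set, so the emitted table no longer depends on hash-map iteration order. A toy version of that two-phase assignment (using std::unordered_map to stand in for the unordered NameTable):

  #include <cstddef>
  #include <set>
  #include <string>
  #include <unordered_map>

  static void
  assignSortedIndices(std::unordered_map<std::string, size_t> &NameTable) {
    std::set<std::string> Sorted;            // ordered view of the keys
    for (const auto &Entry : NameTable)
      Sorted.insert(Entry.first);
    size_t Idx = 0;
    for (const std::string &Name : Sorted)   // deterministic order
      NameTable[Name] = Idx++;
  }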
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index caa0691f9205..17144522db82 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -122,35 +122,38 @@ APInt::APInt(unsigned numbits, StringRef Str, uint8_t radix)
fromString(numbits, Str, radix);
}
+void APInt::reallocate(unsigned NewBitWidth) {
+ // If the number of words is the same we can just change the width and stop.
+ if (getNumWords() == getNumWords(NewBitWidth)) {
+ BitWidth = NewBitWidth;
+ return;
+ }
+
+ // If we have an allocation, delete it.
+ if (!isSingleWord())
+ delete [] U.pVal;
+
+ // Update BitWidth.
+ BitWidth = NewBitWidth;
+
+ // If we are supposed to have an allocation, create it.
+ if (!isSingleWord())
+ U.pVal = getMemory(getNumWords());
+}
+
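reallocate() only touches the heap when the word count actually changes; otherwise it just retags the width. A self-contained toy version of the same policy for a bignum that stores one word inline and spills to the heap otherwise:

  #include <cstdint>

  struct ToyInt {
    unsigned BitWidth;
    union { uint64_t VAL; uint64_t *pVal; } U;

    static unsigned numWords(unsigned Bits) { return (Bits + 63) / 64; }
    bool isSingleWord() const { return BitWidth <= 64; }

    void reallocate(unsigned NewBitWidth) {
      // Same number of words: only the width tag changes.
      if (numWords(BitWidth) == numWords(NewBitWidth)) {
        BitWidth = NewBitWidth;
        return;
      }
      if (!isSingleWord())
        delete[] U.pVal;                               // drop old storage
      BitWidth = NewBitWidth;
      if (!isSingleWord())
        U.pVal = new uint64_t[numWords(BitWidth)]();   // fresh storage
    }
  };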
void APInt::AssignSlowCase(const APInt& RHS) {
// Don't do anything for X = X
if (this == &RHS)
return;
- if (BitWidth == RHS.getBitWidth()) {
- // assume same bit-width single-word case is already handled
- assert(!isSingleWord());
- memcpy(U.pVal, RHS.U.pVal, getNumWords() * APINT_WORD_SIZE);
- return;
- }
+ // Adjust the bit width and handle allocations as necessary.
+ reallocate(RHS.getBitWidth());
- if (isSingleWord()) {
- // assume case where both are single words is already handled
- assert(!RHS.isSingleWord());
- U.pVal = getMemory(RHS.getNumWords());
- memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- } else if (getNumWords() == RHS.getNumWords())
- memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- else if (RHS.isSingleWord()) {
- delete [] U.pVal;
+ // Copy the data.
+ if (isSingleWord())
U.VAL = RHS.U.VAL;
- } else {
- delete [] U.pVal;
- U.pVal = getMemory(RHS.getNumWords());
- memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- }
- BitWidth = RHS.BitWidth;
- clearUnusedBits();
+ else
+ memcpy(U.pVal, RHS.U.pVal, getNumWords() * APINT_WORD_SIZE);
}
/// This method 'profiles' an APInt for use with FoldingSet.
@@ -1138,10 +1141,13 @@ APInt APInt::multiplicativeInverse(const APInt& modulo) const {
return APInt(BitWidth, 0);
// The next-to-last t is the multiplicative inverse. However, we are
- // interested in a positive inverse. Calcuate a positive one from a negative
+ // interested in a positive inverse. Calculate a positive one from a negative
// one if necessary. A simple addition of the modulo suffices because
// abs(t[i]) is known to be less than *this/2 (see the link above).
- return t[i].isNegative() ? t[i] + modulo : t[i];
+ if (t[i].isNegative())
+ t[i] += modulo;
+
+ return std::move(t[i]);
}
/// Calculate the magic numbers required to implement a signed integer division
@@ -1240,7 +1246,7 @@ APInt::mu APInt::magicu(unsigned LeadingZeros) const {
/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The
/// variables here have the same names as in the algorithm. Comments explain
/// the algorithm and any deviation from it.
-static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
+static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
unsigned m, unsigned n) {
assert(u && "Must provide dividend");
assert(v && "Must provide divisor");
@@ -1266,16 +1272,16 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// overflow. Note that this can require an extra word in u so that u must
// be of length m+n+1.
unsigned shift = countLeadingZeros(v[n-1]);
- unsigned v_carry = 0;
- unsigned u_carry = 0;
+ uint32_t v_carry = 0;
+ uint32_t u_carry = 0;
if (shift) {
for (unsigned i = 0; i < m+n; ++i) {
- unsigned u_tmp = u[i] >> (32 - shift);
+ uint32_t u_tmp = u[i] >> (32 - shift);
u[i] = (u[i] << shift) | u_carry;
u_carry = u_tmp;
}
for (unsigned i = 0; i < n; ++i) {
- unsigned v_tmp = v[i] >> (32 - shift);
+ uint32_t v_tmp = v[i] >> (32 - shift);
v[i] = (v[i] << shift) | v_carry;
v_carry = v_tmp;
}
@@ -1296,11 +1302,11 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q')
// Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r')
// Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease
- // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test
+ // qp by 1, increase rp by v[n-1], and repeat this test if rp < b. The test
// on v[n-2] determines at high speed most of the cases in which the trial
// value qp is one too large, and it eliminates all cases where qp is two
// too large.
- uint64_t dividend = ((uint64_t(u[j+n]) << 32) + u[j+n-1]);
+ uint64_t dividend = Make_64(u[j+n], u[j+n-1]);
DEBUG(dbgs() << "KnuthDiv: dividend == " << dividend << '\n');
uint64_t qp = dividend / v[n-1];
uint64_t rp = dividend % v[n-1];
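Make_64, Lo_32, and Hi_32 (from llvm/Support/MathExtras.h) just pack and unpack the 32-bit halves of a 64-bit word, which reads better than the inline shifts and casts they replace; roughly:

  #include <cstdint>

  static uint64_t make64(uint32_t Hi, uint32_t Lo) {
    return (uint64_t(Hi) << 32) | Lo;   // Hi:Lo as one 64-bit word
  }
  static uint32_t lo32(uint64_t V) { return uint32_t(V); }
  static uint32_t hi32(uint64_t V) { return uint32_t(V >> 32); }

  // With base b = 2^32, the trial dividend u[j+n]*b + u[j+n-1] is
  // make64(u[j+n], u[j+n-1]).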
@@ -1323,14 +1329,14 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
int64_t borrow = 0;
for (unsigned i = 0; i < n; ++i) {
uint64_t p = uint64_t(qp) * uint64_t(v[i]);
- int64_t subres = int64_t(u[j+i]) - borrow - (unsigned)p;
- u[j+i] = (unsigned)subres;
- borrow = (p >> 32) - (subres >> 32);
+ int64_t subres = int64_t(u[j+i]) - borrow - Lo_32(p);
+ u[j+i] = Lo_32(subres);
+ borrow = Hi_32(p) - Hi_32(subres);
DEBUG(dbgs() << "KnuthDiv: u[j+i] = " << u[j+i]
<< ", borrow = " << borrow << '\n');
}
bool isNeg = u[j+n] < borrow;
- u[j+n] -= (unsigned)borrow;
+ u[j+n] -= Lo_32(borrow);
DEBUG(dbgs() << "KnuthDiv: after subtraction:");
DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
@@ -1338,7 +1344,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was
// negative, go to step D6; otherwise go on to step D7.
- q[j] = (unsigned)qp;
+ q[j] = Lo_32(qp);
if (isNeg) {
// D6. [Add back]. The probability that this step is necessary is very
// small, on the order of only 2/b. Make sure that test data accounts for
@@ -1349,7 +1355,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// since it cancels with the borrow that occurred in D4.
bool carry = false;
for (unsigned i = 0; i < n; i++) {
- unsigned limit = std::min(u[j+i],v[i]);
+ uint32_t limit = std::min(u[j+i],v[i]);
u[j+i] += v[i] + carry;
carry = u[j+i] < limit || (carry && u[j+i] == limit);
}
@@ -1374,7 +1380,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// multiplication by d by using a shift left. So, all we have to do is
// shift right here.
if (shift) {
- unsigned carry = 0;
+ uint32_t carry = 0;
DEBUG(dbgs() << "KnuthDiv: remainder:");
for (int i = n-1; i >= 0; i--) {
r[i] = (u[i] >> shift) | carry;
@@ -1403,17 +1409,16 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// can't use 64-bit operands here because we don't have native results of
// 128 bits. Furthermore, casting the 64-bit values to 32-bit values won't
// work on big-endian machines.
- uint64_t mask = ~0ull >> (sizeof(unsigned)*CHAR_BIT);
unsigned n = rhsWords * 2;
unsigned m = (lhsWords * 2) - n;
// Allocate space for the temporary values we need either on the stack, if
// it will fit, or on the heap if it won't.
- unsigned SPACE[128];
- unsigned *U = nullptr;
- unsigned *V = nullptr;
- unsigned *Q = nullptr;
- unsigned *R = nullptr;
+ uint32_t SPACE[128];
+ uint32_t *U = nullptr;
+ uint32_t *V = nullptr;
+ uint32_t *Q = nullptr;
+ uint32_t *R = nullptr;
if ((Remainder?4:3)*n+2*m+1 <= 128) {
U = &SPACE[0];
V = &SPACE[m+n+1];
@@ -1421,34 +1426,34 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
if (Remainder)
R = &SPACE[(m+n+1) + n + (m+n)];
} else {
- U = new unsigned[m + n + 1];
- V = new unsigned[n];
- Q = new unsigned[m+n];
+ U = new uint32_t[m + n + 1];
+ V = new uint32_t[n];
+ Q = new uint32_t[m+n];
if (Remainder)
- R = new unsigned[n];
+ R = new uint32_t[n];
}
// Initialize the dividend
- memset(U, 0, (m+n+1)*sizeof(unsigned));
+ memset(U, 0, (m+n+1)*sizeof(uint32_t));
for (unsigned i = 0; i < lhsWords; ++i) {
- uint64_t tmp = (LHS.getNumWords() == 1 ? LHS.U.VAL : LHS.U.pVal[i]);
- U[i * 2] = (unsigned)(tmp & mask);
- U[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+ uint64_t tmp = LHS.getRawData()[i];
+ U[i * 2] = Lo_32(tmp);
+ U[i * 2 + 1] = Hi_32(tmp);
}
U[m+n] = 0; // this extra word is for "spill" in the Knuth algorithm.
// Initialize the divisor
- memset(V, 0, (n)*sizeof(unsigned));
+ memset(V, 0, (n)*sizeof(uint32_t));
for (unsigned i = 0; i < rhsWords; ++i) {
- uint64_t tmp = (RHS.getNumWords() == 1 ? RHS.U.VAL : RHS.U.pVal[i]);
- V[i * 2] = (unsigned)(tmp & mask);
- V[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+ uint64_t tmp = RHS.getRawData()[i];
+ V[i * 2] = Lo_32(tmp);
+ V[i * 2 + 1] = Hi_32(tmp);
}
// initialize the quotient and remainder
- memset(Q, 0, (m+n) * sizeof(unsigned));
+ memset(Q, 0, (m+n) * sizeof(uint32_t));
if (Remainder)
- memset(R, 0, n * sizeof(unsigned));
+ memset(R, 0, n * sizeof(uint32_t));
// Now, adjust m and n for the Knuth division. n is the number of words in
// the divisor. m is the number of words by which the dividend exceeds the
@@ -1469,22 +1474,22 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// are using base 2^32 instead of base 10.
assert(n != 0 && "Divide by zero?");
if (n == 1) {
- unsigned divisor = V[0];
- unsigned remainder = 0;
- for (int i = m+n-1; i >= 0; i--) {
- uint64_t partial_dividend = uint64_t(remainder) << 32 | U[i];
+ uint32_t divisor = V[0];
+ uint32_t remainder = 0;
+ for (int i = m; i >= 0; i--) {
+ uint64_t partial_dividend = Make_64(remainder, U[i]);
if (partial_dividend == 0) {
Q[i] = 0;
remainder = 0;
} else if (partial_dividend < divisor) {
Q[i] = 0;
- remainder = (unsigned)partial_dividend;
+ remainder = Lo_32(partial_dividend);
} else if (partial_dividend == divisor) {
Q[i] = 1;
remainder = 0;
} else {
- Q[i] = (unsigned)(partial_dividend / divisor);
- remainder = (unsigned)(partial_dividend - (Q[i] * divisor));
+ Q[i] = Lo_32(partial_dividend / divisor);
+ remainder = Lo_32(partial_dividend - (Q[i] * divisor));
}
}
if (R)
@@ -1498,24 +1503,16 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// If the caller wants the quotient
if (Quotient) {
// Set up the Quotient value's memory.
- if (Quotient->BitWidth != LHS.BitWidth) {
- if (Quotient->isSingleWord())
- Quotient->U.VAL = 0;
- else
- delete [] Quotient->U.pVal;
- Quotient->BitWidth = LHS.BitWidth;
- if (!Quotient->isSingleWord())
- Quotient->U.pVal = getClearedMemory(Quotient->getNumWords());
- } else
- Quotient->clearAllBits();
+ Quotient->reallocate(LHS.BitWidth);
+ // Clear out any previous bits.
+ Quotient->clearAllBits();
// The quotient is in Q. Reconstitute the quotient into Quotient's low
// order words.
// This case is currently dead as all users of divide() handle trivial cases
// earlier.
if (lhsWords == 1) {
- uint64_t tmp =
- uint64_t(Q[0]) | (uint64_t(Q[1]) << (APINT_BITS_PER_WORD / 2));
+ uint64_t tmp = Make_64(Q[1], Q[0]);
if (Quotient->isSingleWord())
Quotient->U.VAL = tmp;
else
@@ -1523,30 +1520,21 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
} else {
assert(!Quotient->isSingleWord() && "Quotient APInt not large enough");
for (unsigned i = 0; i < lhsWords; ++i)
- Quotient->U.pVal[i] =
- uint64_t(Q[i*2]) | (uint64_t(Q[i*2+1]) << (APINT_BITS_PER_WORD / 2));
+ Quotient->U.pVal[i] = Make_64(Q[i*2+1], Q[i*2]);
}
}
// If the caller wants the remainder
if (Remainder) {
// Set up the Remainder value's memory.
- if (Remainder->BitWidth != RHS.BitWidth) {
- if (Remainder->isSingleWord())
- Remainder->U.VAL = 0;
- else
- delete [] Remainder->U.pVal;
- Remainder->BitWidth = RHS.BitWidth;
- if (!Remainder->isSingleWord())
- Remainder->U.pVal = getClearedMemory(Remainder->getNumWords());
- } else
- Remainder->clearAllBits();
+ Remainder->reallocate(RHS.BitWidth);
+ // Clear out any previous bits.
+ Remainder->clearAllBits();
// The remainder is in R. Reconstitute the remainder into Remainder's low
// order words.
if (rhsWords == 1) {
- uint64_t tmp =
- uint64_t(R[0]) | (uint64_t(R[1]) << (APINT_BITS_PER_WORD / 2));
+ uint64_t tmp = Make_64(R[1], R[0]);
if (Remainder->isSingleWord())
Remainder->U.VAL = tmp;
else
@@ -1554,8 +1542,7 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
} else {
assert(!Remainder->isSingleWord() && "Remainder APInt not large enough");
for (unsigned i = 0; i < rhsWords; ++i)
- Remainder->U.pVal[i] =
- uint64_t(R[i*2]) | (uint64_t(R[i*2+1]) << (APINT_BITS_PER_WORD / 2));
+ Remainder->U.pVal[i] = Make_64(R[i*2+1], R[i*2]);
}
}
@@ -1578,29 +1565,30 @@ APInt APInt::udiv(const APInt& RHS) const {
}
// Get some facts about the LHS and RHS number of bits and words
- unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(getActiveBits());
+ unsigned rhsBits = RHS.getActiveBits();
+ unsigned rhsWords = getNumWords(rhsBits);
assert(rhsWords && "Divided by zero???");
- unsigned lhsBits = this->getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
// Deal with some degenerate cases
if (!lhsWords)
// 0 / X ===> 0
return APInt(BitWidth, 0);
- else if (lhsWords < rhsWords || this->ult(RHS)) {
+ if (rhsBits == 1)
+ // X / 1 ===> X
+ return *this;
+ if (lhsWords < rhsWords || this->ult(RHS))
// X / Y ===> 0, iff X < Y
return APInt(BitWidth, 0);
- } else if (*this == RHS) {
+ if (*this == RHS)
// X / X ===> 1
return APInt(BitWidth, 1);
- } else if (lhsWords == 1 && rhsWords == 1) {
+ if (lhsWords == 1) // rhsWords is 1 if lhsWords is 1.
// All high words are zero, just use native divide
return APInt(BitWidth, this->U.pVal[0] / RHS.U.pVal[0]);
- }
// We have to compute it the hard way. Invoke the Knuth divide algorithm.
- APInt Quotient(1,0); // to hold result.
+ APInt Quotient; // to hold result.
divide(*this, lhsWords, RHS, rhsWords, &Quotient, nullptr);
return Quotient;
}
@@ -1624,31 +1612,32 @@ APInt APInt::urem(const APInt& RHS) const {
}
// Get some facts about the LHS
- unsigned lhsBits = getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(getActiveBits());
// Get some facts about the RHS
unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned rhsWords = getNumWords(rhsBits);
assert(rhsWords && "Performing remainder operation by zero ???");
// Check the degenerate cases
- if (lhsWords == 0) {
+ if (lhsWords == 0)
// 0 % Y ===> 0
return APInt(BitWidth, 0);
- } else if (lhsWords < rhsWords || this->ult(RHS)) {
+ if (rhsBits == 1)
+ // X % 1 ===> 0
+ return APInt(BitWidth, 0);
+ if (lhsWords < rhsWords || this->ult(RHS))
// X % Y ===> X, iff X < Y
return *this;
- } else if (*this == RHS) {
+ if (*this == RHS)
// X % X == 0;
return APInt(BitWidth, 0);
- } else if (lhsWords == 1) {
+ if (lhsWords == 1)
// All high words are zero, just use native remainder
return APInt(BitWidth, U.pVal[0] % RHS.U.pVal[0]);
- }
// We have to compute it the hard way. Invoke the Knuth divide algorithm.
- APInt Remainder(1,0);
+ APInt Remainder;
divide(*this, lhsWords, RHS, rhsWords, nullptr, &Remainder);
return Remainder;
}
@@ -1667,22 +1656,23 @@ APInt APInt::srem(const APInt &RHS) const {
void APInt::udivrem(const APInt &LHS, const APInt &RHS,
APInt &Quotient, APInt &Remainder) {
assert(LHS.BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ unsigned BitWidth = LHS.BitWidth;
// First, deal with the easy case
if (LHS.isSingleWord()) {
assert(RHS.U.VAL != 0 && "Divide by zero?");
uint64_t QuotVal = LHS.U.VAL / RHS.U.VAL;
uint64_t RemVal = LHS.U.VAL % RHS.U.VAL;
- Quotient = APInt(LHS.BitWidth, QuotVal);
- Remainder = APInt(LHS.BitWidth, RemVal);
+ Quotient = APInt(BitWidth, QuotVal);
+ Remainder = APInt(BitWidth, RemVal);
return;
}
// Get some size facts about the dividend and divisor
- unsigned lhsBits = LHS.getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(LHS.getActiveBits());
unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned rhsWords = getNumWords(rhsBits);
+ assert(rhsWords && "Performing divrem operation by zero ???");
// Check the degenerate cases
if (lhsWords == 0) {
@@ -1691,6 +1681,11 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
return;
}
+ if (rhsBits == 1) {
+ Quotient = LHS; // X / 1 ===> X
+ Remainder = 0; // X % 1 ===> 0
+ return; // both results are set; skip the general path below
+ }
+
if (lhsWords < rhsWords || LHS.ult(RHS)) {
Remainder = LHS; // X % Y ===> X, iff X < Y
Quotient = 0; // X / Y ===> 0, iff X < Y
@@ -1703,12 +1698,15 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
return;
}
- if (lhsWords == 1 && rhsWords == 1) {
+ if (lhsWords == 1) { // rhsWords is 1 if lhsWords is 1.
// There is only one word to consider so use the native versions.
- uint64_t lhsValue = LHS.isSingleWord() ? LHS.U.VAL : LHS.U.pVal[0];
- uint64_t rhsValue = RHS.isSingleWord() ? RHS.U.VAL : RHS.U.pVal[0];
- Quotient = APInt(LHS.getBitWidth(), lhsValue / rhsValue);
- Remainder = APInt(LHS.getBitWidth(), lhsValue % rhsValue);
+ uint64_t lhsValue = LHS.U.pVal[0];
+ uint64_t rhsValue = RHS.U.pVal[0];
+ // Make sure there is enough space to hold the results.
+ Quotient.reallocate(BitWidth);
+ Remainder.reallocate(BitWidth);
+ Quotient = lhsValue / rhsValue;
+ Remainder = lhsValue % rhsValue;
return;
}
@@ -1723,12 +1721,12 @@ void APInt::sdivrem(const APInt &LHS, const APInt &RHS,
APInt::udivrem(-LHS, -RHS, Quotient, Remainder);
else {
APInt::udivrem(-LHS, RHS, Quotient, Remainder);
- Quotient = -Quotient;
+ Quotient.negate();
}
- Remainder = -Remainder;
+ Remainder.negate();
} else if (RHS.isNegative()) {
APInt::udivrem(LHS, -RHS, Quotient, Remainder);
- Quotient = -Quotient;
+ Quotient.negate();
} else {
APInt::udivrem(LHS, RHS, Quotient, Remainder);
}
@@ -1859,10 +1857,8 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
*this += digit;
}
// If its negative, put it in two's complement form
- if (isNeg) {
- --(*this);
- this->flipAllBits();
- }
+ if (isNeg)
+ this->negate();
}
void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
@@ -1940,8 +1936,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
// They want to print the signed version and it is a negative value
// Flip the bits and add one to turn it into the equivalent positive
// value and put a '-' in the result.
- Tmp.flipAllBits();
- ++Tmp;
+ Tmp.negate();
Str.push_back('-');
}
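negate() is two's-complement negation, folding the flip-all-bits-then-increment sequence it replaces here and below. The same identity on a plain 64-bit word:

  #include <cstdint>

  static uint64_t negate64(uint64_t V) {
    return ~V + 1;   // two's complement: equal to 0 - V (mod 2^64)
  }
  // negate64(1) == 0xFFFFFFFFFFFFFFFF, the 64-bit encoding of -1.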
@@ -1961,22 +1956,19 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
unsigned ShiftAmt = (Radix == 16 ? 4 : (Radix == 8 ? 3 : 1));
unsigned MaskAmt = Radix - 1;
- while (Tmp != 0) {
+ while (Tmp.getBoolValue()) {
unsigned Digit = unsigned(Tmp.getRawData()[0]) & MaskAmt;
Str.push_back(Digits[Digit]);
Tmp.lshrInPlace(ShiftAmt);
}
} else {
- APInt divisor(Radix == 10? 4 : 8, Radix);
- while (Tmp != 0) {
- APInt APdigit(1, 0);
- APInt tmp2(Tmp.getBitWidth(), 0);
- divide(Tmp, Tmp.getNumWords(), divisor, divisor.getNumWords(), &tmp2,
- &APdigit);
+ APInt divisor(Tmp.getBitWidth(), Radix);
+ APInt APdigit;
+ while (Tmp.getBoolValue()) {
+ udivrem(Tmp, divisor, Tmp, APdigit);
unsigned Digit = (unsigned)APdigit.getZExtValue();
assert(Digit < Radix && "divide failed");
Str.push_back(Digits[Digit]);
- Tmp = tmp2;
}
}
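The rewritten else-branch is the classic repeated-divmod radix conversion, now reusing a single full-width divisor and udivrem() instead of allocating fresh temporaries for every digit. For a native integer the same loop is (valid for Radix between 2 and 36):

  #include <cstdint>
  #include <string>

  static std::string toStringBase(uint64_t V, unsigned Radix) {
    static const char Digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    if (V == 0)
      return "0";
    std::string S;
    while (V != 0) {
      S.insert(S.begin(), Digits[V % Radix]); // remainder is the next digit
      V /= Radix;                             // quotient carries on
    }
    return S;
  }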
@@ -2346,13 +2338,11 @@ int APInt::tcMultiply(WordType *dst, const WordType *lhs,
return overflow;
}
-/* DST = LHS * RHS, where DST has width the sum of the widths of the
- operands. No overflow occurs. DST must be disjoint from both
- operands. Returns the number of parts required to hold the
- result. */
-unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
- const WordType *rhs, unsigned lhsParts,
- unsigned rhsParts) {
+/// DST = LHS * RHS, where DST has width the sum of the widths of the
+/// operands. No overflow occurs. DST must be disjoint from both operands.
+void APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
+ const WordType *rhs, unsigned lhsParts,
+ unsigned rhsParts) {
/* Put the narrower number on the LHS for fewer loop iterations below. */
if (lhsParts > rhsParts)
return tcFullMultiply (dst, rhs, lhs, rhsParts, lhsParts);
@@ -2363,10 +2353,6 @@ unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
for (unsigned i = 0; i < lhsParts; i++)
tcMultiplyPart(&dst[i], rhs, lhs[i], 0, rhsParts, rhsParts + 1, true);
-
- unsigned n = lhsParts + rhsParts;
-
- return n - (dst[n - 1] == 0);
}
/* If RHS is zero LHS and REMAINDER are left unchanged, return one.
@@ -2400,22 +2386,20 @@ int APInt::tcDivide(WordType *lhs, const WordType *rhs,
/* Loop, subtracting SRHS if REMAINDER is greater and adding that to
the total. */
for (;;) {
- int compare;
-
- compare = tcCompare(remainder, srhs, parts);
- if (compare >= 0) {
- tcSubtract(remainder, srhs, 0, parts);
- lhs[n] |= mask;
- }
+ int compare = tcCompare(remainder, srhs, parts);
+ if (compare >= 0) {
+ tcSubtract(remainder, srhs, 0, parts);
+ lhs[n] |= mask;
+ }
- if (shiftCount == 0)
- break;
- shiftCount--;
- tcShiftRight(srhs, parts, 1);
- if ((mask >>= 1) == 0) {
- mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1);
- n--;
- }
+ if (shiftCount == 0)
+ break;
+ shiftCount--;
+ tcShiftRight(srhs, parts, 1);
+ if ((mask >>= 1) == 0) {
+ mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1);
+ n--;
+ }
}
return false;
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 63c440037c22..83376284548f 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -81,6 +81,7 @@ add_llvm_library(LLVMSupport
MD5.cpp
NativeFormatting.cpp
Options.cpp
+ Parallel.cpp
PluginLoader.cpp
PrettyStackTrace.cpp
RandomNumberGenerator.cpp
diff --git a/lib/Support/Parallel.cpp b/lib/Support/Parallel.cpp
new file mode 100644
index 000000000000..ab2cfdebf07d
--- /dev/null
+++ b/lib/Support/Parallel.cpp
@@ -0,0 +1,138 @@
+//===- llvm/Support/Parallel.cpp - Parallel algorithms --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Parallel.h"
+#include "llvm/Config/llvm-config.h"
+
+#include <atomic>
+#include <stack>
+#include <thread>
+
+using namespace llvm;
+
+namespace {
+
+/// \brief An abstract class that takes closures and runs them asynchronously.
+class Executor {
+public:
+ virtual ~Executor() = default;
+ virtual void add(std::function<void()> func) = 0;
+
+ static Executor *getDefaultExecutor();
+};
+
+#if !LLVM_ENABLE_THREADS
+class SyncExecutor : public Executor {
+public:
+ virtual void add(std::function<void()> F) { F(); }
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static SyncExecutor Exec;
+ return &Exec;
+}
+
+#elif defined(_MSC_VER)
+/// \brief An Executor that runs tasks via ConcRT.
+class ConcRTExecutor : public Executor {
+ struct Taskish {
+ Taskish(std::function<void()> Task) : Task(Task) {}
+
+ std::function<void()> Task;
+
+ static void run(void *P) {
+ Taskish *Self = static_cast<Taskish *>(P);
+ Self->Task();
+ concurrency::Free(Self);
+ }
+ };
+
+public:
+ virtual void add(std::function<void()> F) {
+ Concurrency::CurrentScheduler::ScheduleTask(
+ Taskish::run, new (concurrency::Alloc(sizeof(Taskish))) Taskish(F));
+ }
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static ConcRTExecutor exec;
+ return &exec;
+}
+
+#else
+/// \brief An implementation of an Executor that runs closures on a thread pool
+/// in FILO (last in, first out) order.
+class ThreadPoolExecutor : public Executor {
+public:
+ explicit ThreadPoolExecutor(
+ unsigned ThreadCount = std::thread::hardware_concurrency())
+ : Done(ThreadCount) {
+ // Spawn all but one of the threads from a helper thread, since spawning
+ // threads can take a while.
+ std::thread([&, ThreadCount] {
+ for (size_t i = 1; i < ThreadCount; ++i) {
+ std::thread([=] { work(); }).detach();
+ }
+ work();
+ }).detach();
+ }
+
+ ~ThreadPoolExecutor() override {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ Stop = true;
+ Lock.unlock();
+ Cond.notify_all();
+ // Wait for ~Latch.
+ }
+
+ void add(std::function<void()> F) override {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ WorkStack.push(F);
+ Lock.unlock();
+ Cond.notify_one();
+ }
+
+private:
+ void work() {
+ while (true) {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
+ if (Stop)
+ break;
+ auto Task = WorkStack.top();
+ WorkStack.pop();
+ Lock.unlock();
+ Task();
+ }
+ Done.dec();
+ }
+
+ std::atomic<bool> Stop{false};
+ std::stack<std::function<void()>> WorkStack;
+ std::mutex Mutex;
+ std::condition_variable Cond;
+ parallel::detail::Latch Done;
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static ThreadPoolExecutor exec;
+ return &exec;
+}
+#endif
+}
+
+#if LLVM_ENABLE_THREADS
+void parallel::detail::TaskGroup::spawn(std::function<void()> F) {
+ L.inc();
+ Executor::getDefaultExecutor()->add([&, F] {
+ F();
+ L.dec();
+ });
+}
+#endif
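A hedged usage sketch for the new interface: TaskGroup::spawn() bumps the latch, hands the closure to the default executor, and the latch is released when the closure finishes; per the "// Wait for ~Latch" comment above, destruction blocks until the count drains to zero.

  #include "llvm/Support/Parallel.h"

  void runTasks() {
    llvm::parallel::detail::TaskGroup TG;
    for (int I = 0; I < 8; ++I)
      TG.spawn([I] {
        // ... per-task work using I ...
      });
  } // TG's destructor waits here until all eight closures are done.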
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index fa28ba1b6ab6..cdea09be41e0 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -103,13 +103,16 @@
#define STATVFS_F_FLAG(vfs) (vfs).f_flags
#endif
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+#include <sys/sysctl.h>
+#endif
+
using namespace llvm;
namespace llvm {
namespace sys {
namespace fs {
-#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
- defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \
+#if defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \
defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__) || \
defined(_AIX)
static int
@@ -164,7 +167,7 @@ getprogpath(char ret[PATH_MAX], const char *bin)
free(pv);
return nullptr;
}
-#endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__
+#endif // Bitrig || OpenBSD || minix || linux || CYGWIN || DragonFly || AIX
/// GetMainExecutable - Return the path to the main executable, given the
/// value of argv[0] from program startup.
@@ -180,9 +183,24 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
if (realpath(exe_path, link_path))
return link_path;
}
-#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
- defined(__OpenBSD__) || defined(__minix) || defined(__DragonFly__) || \
- defined(__FreeBSD_kernel__) || defined(_AIX)
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__)
+ int mib[4];
+ mib[0] = CTL_KERN;
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+ mib[1] = KERN_PROC;
+ mib[2] = KERN_PROC_PATHNAME;
+ mib[3] = -1;
+#else
+ mib[1] = KERN_PROC_ARGS;
+ mib[2] = -1;
+ mib[3] = KERN_PROC_PATHNAME;
+#endif
+ char exe_path[PATH_MAX];
+ size_t cb = sizeof(exe_path);
+ if (sysctl(mib, 4, exe_path, &cb, NULL, 0) == 0)
+ return exe_path;
+#elif defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \
+ defined(__DragonFly__) || defined(_AIX)
char exe_path[PATH_MAX];
if (getprogpath(exe_path, argv0) != NULL)
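A standalone FreeBSD version of the sysctl lookup added above, assuming only base-system headers; the final -1 in the MIB selects the calling process:

  #include <sys/types.h>
  #include <sys/sysctl.h>
  #include <limits.h>
  #include <stdio.h>

  int main() {
    int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
    char path[PATH_MAX];
    size_t len = sizeof(path);
    if (sysctl(mib, 4, path, &len, NULL, 0) == 0)
      printf("%s\n", path);   // absolute path of this executable
    return 0;
  }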
diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index 16f8f5a98e52..1d0143c6716e 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc
@@ -347,7 +347,7 @@ static bool terminalHasColors(int fd) {
MutexGuard G(*TermColorMutex);
int errret = 0;
- if (setupterm((char *)nullptr, fd, &errret) != 0)
+ if (setupterm(nullptr, fd, &errret) != 0)
// Regardless of why, if we can't get terminfo, we shouldn't try to print
// colors.
return false;
@@ -369,7 +369,7 @@ static bool terminalHasColors(int fd) {
// Now extract the structure allocated by setupterm and free its memory
// through a really silly dance.
- struct term *termp = set_curterm((struct term *)nullptr);
+ struct term *termp = set_curterm(nullptr);
(void)del_curterm(termp); // Drop any errors here.
// Return true if we found a color capabilities for the current terminal.
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 73f2b6a25f66..4af5fef4287c 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -216,6 +216,7 @@ def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon
]>;
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index ff3e4c40e2c2..29f6d571d6bd 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -380,7 +380,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets);
}
- CallSeqStart.addImm(Handler.StackSize);
+ CallSeqStart.addImm(Handler.StackSize).addImm(0);
MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
.addImm(Handler.StackSize)
.addImm(0);
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 083708001757..9ac7ecb9cdb4 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -3014,7 +3014,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes);
+ .addImm(NumBytes).addImm(0);
// Process the args.
for (CCValAssign &VA : ArgLocs) {
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4b1bb27dce73..4f7c2e122390 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2265,7 +2265,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
- StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+ StructType *RetTy = StructType::get(ArgTy, ArgTy);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
@@ -3249,9 +3249,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL,
- true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
getPointerTy(DAG.getDataLayout()));
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index cb268828455e..c42738da7ab0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3427,6 +3427,10 @@ static bool getFMAPatterns(MachineInstr &Root,
Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
Found = true;
}
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
+ Found = true;
+ }
break;
case AArch64::FSUBDrr:
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
@@ -3441,6 +3445,10 @@ static bool getFMAPatterns(MachineInstr &Root,
Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
Found = true;
}
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
+ Found = true;
+ }
break;
case AArch64::FSUBv2f32:
if (canCombineWithFMUL(MBB, Root.getOperand(2),
@@ -3495,6 +3503,8 @@ AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
case MachineCombinerPattern::FMULADDD_OP2:
case MachineCombinerPattern::FMULSUBD_OP1:
case MachineCombinerPattern::FMULSUBD_OP2:
+ case MachineCombinerPattern::FNMULSUBS_OP1:
+ case MachineCombinerPattern::FNMULSUBD_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
@@ -3996,6 +4006,24 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
}
+
+ case MachineCombinerPattern::FNMULSUBS_OP1:
+ case MachineCombinerPattern::FNMULSUBD_OP1: {
+ // FNMUL I=A,B,0
+ // FSUB R,I,C
+ // ==> FNMADD R,A,B,C // = -A*B - C
+ // --- Create(FNMADD);
+ if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
+ Opc = AArch64::FNMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FNMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ }
+
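A quick arithmetic check of the rewrite in the comment above, with illustrative values A = 2, B = 3, C = 5:

  FNMUL:  I = -(A*B)     = -6
  FSUB:   R = I - C      = -6 - 5      = -11
  FNMADD: R = -(A*B) - C = -(2*3) - 5  = -11

so the fused FNMADD produces the same value as the FNMUL/FSUB pair in a single instruction.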
case MachineCombinerPattern::FMULSUBS_OP2:
case MachineCombinerPattern::FMULSUBD_OP2: {
// FMUL I=A,B,0
@@ -4011,6 +4039,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
+ }
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
Opc = AArch64::FMLSv1i32_indexed;
@@ -4067,7 +4096,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
FMAInstKind::Accumulator);
}
break;
- }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
DelInstrs.push_back(MUL);
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 902b08844216..5ddf66654a67 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -156,7 +156,8 @@ def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
- SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+ SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOutGlue]>;
def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",
SDCallSeqEnd<[ SDTCisVT<0, i32>,
@@ -328,8 +329,9 @@ include "AArch64InstrFormats.td"
let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
// We set Sched to the empty list because we expect these instructions to
// simply get removed in most cases.
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- [(AArch64callseq_start timm:$amt)]>, Sched<[]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(AArch64callseq_start timm:$amt1, timm:$amt2)]>,
+ Sched<[]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
Sched<[]>;
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 5f895903da6f..789270c2a34b 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -529,9 +529,34 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// for the greedy mode the cost of the cross bank copy will
// offset this number.
// FIXME: Should be derived from the scheduling model.
- if (OpRegBankIdx[0] >= PMI_FirstFPR)
+ if (OpRegBankIdx[0] != PMI_FirstGPR)
Cost = 2;
+ else
+ // Check if that load feeds fp instructions.
+ // In that case, we want the default mapping to be on FPR
+ // instead of blindly mapping every scalar to GPR.
+ for (const MachineInstr &UseMI :
+ MRI.use_instructions(MI.getOperand(0).getReg()))
+ // If we have at least one direct use in a FP instruction,
+ // assume this was a floating point load in the IR.
+ // If it was not, we would have had a bitcast before
+ // reaching that instruction.
+ if (isPreISelGenericFloatingPointOpcode(UseMI.getOpcode())) {
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ break;
+ }
break;
+ case TargetOpcode::G_STORE:
+ // Check if that store is fed by fp instructions.
+ if (OpRegBankIdx[0] == PMI_FirstGPR) {
+ unsigned VReg = MI.getOperand(0).getReg();
+ if (!VReg)
+ break;
+ MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ if (isPreISelGenericFloatingPointOpcode(DefMI->getOpcode()))
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ break;
+ }
}
// Finally construct the computed mapping.
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index 8f8eeef8a6cf..a9b4d44a523e 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -42,11 +42,11 @@ def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FMULX16, FMULX32)>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FMULX16, FMULX32)>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FMULX64)>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FMULX64)>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>;
@@ -62,9 +62,9 @@ def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4
def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>;
def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>;
-def : InstRW<[FalkorWr_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
-def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>;
@@ -72,13 +72,14 @@ def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i1
def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v1i64_indexed$")>;
-def : InstRW<[FalkorWr_2VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
-def : InstRW<[FalkorWr_2VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v2i64_indexed$")>;
// SIMD Integer Instructions
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>;
@@ -119,10 +120,10 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64)
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQRDML(A|S)?H(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>;
@@ -169,9 +170,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>;
def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>;
@@ -185,8 +186,9 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>;
def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>;
-def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)(i16|i32)$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)v.*$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)v[248].*$")>;
+
// SIMD Load Instructions
// -----------------------------------------------------------------------------
def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
@@ -294,9 +296,9 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv
def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>;
def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>;
def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>;
def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>;
@@ -311,9 +313,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>;
def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>;
def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>;
-def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>;
-def : InstRW<[FalkorWr_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>;
def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>;
def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>;
@@ -416,22 +418,25 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>;
-def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>;
def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>;
def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>;
-def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>;
-def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], (instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], (instregex "^F(N)?M(ADD|SUB)Drrr$")>;
// FP Miscellaneous Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>;
+def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(H|S|D)i$")>;
+def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hi|Hr|S0|Si|Sr|D0|Di|Dr|v.*_ns)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hr|Sr|Dr|v.*_ns)$")>;
+// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov 0.0
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>;
def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>;
def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>;
@@ -475,16 +480,17 @@ def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>;
// Divide and Multiply Instructions
// -----------------------------------------------------------------------------
-def : InstRW<[FalkorWr_1X_4cyc], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
-def : InstRW<[FalkorWr_1X_4cyc], (instregex "^M(ADD|SUB)Wrrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], (instregex "^M(ADD|SUB)Wrrr$")>;
-def : InstRW<[FalkorWr_1X_5cyc], (instregex "^(S|U)MULHrr$")>;
-def : InstRW<[FalkorWr_1X_5cyc], (instregex "^M(ADD|SUB)Xrrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^M(ADD|SUB)Xrrr$")>;
def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>;
def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>;
-def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)(MLAL|MLSL|MULL)v.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(S|U)MULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^(S|U)(MLAL|MLSL)v.*$")>;
// Move and Shift Instructions
// -----------------------------------------------------------------------------
diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
index e64b2c441a19..6526cc28e806 100644
--- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
+++ b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
@@ -29,8 +29,9 @@
// Define 1 micro-op types
def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; }
-def FalkorWr_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
-def FalkorWr_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; }
+def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; }
def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; }
def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; }
def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; }
@@ -45,8 +46,10 @@ def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; }
def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; }
def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; }
def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
+def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
-def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
+def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
+def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; }
def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; }
@@ -75,14 +78,26 @@ def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
let Latency = 4;
let NumMicroOps = 2;
}
+def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
let Latency = 5;
let NumMicroOps = 2;
}
+def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
let Latency = 6;
let NumMicroOps = 2;
}
+def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
let Latency = 4;
@@ -350,18 +365,17 @@ def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
let NumMicroOps = 9;
}
-// Forwarding logic is modeled for vector multiply and accumulate
+// Forwarding logic is modeled for multiply add/accumulate.
// -----------------------------------------------------------------------------
-def FalkorReadVMA : SchedReadAdvance<2, [FalkorWr_1VXVY_4cyc,
- FalkorWr_2VXVY_4cyc]>;
-def FalkorReadFMA : SchedReadAdvance<3, [FalkorWr_1VXVY_5cyc,
- FalkorWr_1VXVY_6cyc,
- FalkorWr_2VXVY_5cyc,
- FalkorWr_2VXVY_6cyc]>;
+def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>;
+def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>;
+def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>;
+def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>;
+def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>;
// SchedPredicates and WriteVariants for Immediate Zero and LSLFast
// -----------------------------------------------------------------------------
-def FalkorImmZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
+def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>;
def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>;
def FalkorWr_FMOV : SchedWriteVariant<[
@@ -378,7 +392,6 @@ def FalkorWr_LDR : SchedWriteVariant<[
def FalkorWr_ADD : SchedWriteVariant<[
SchedVar<FalkorLSLFastPred, [FalkorWr_1XYZ_1cyc]>,
- SchedVar<FalkorImmZPred, [FalkorWr_1XYZ_1cyc]>,
SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>;
def FalkorWr_PRFM : SchedWriteVariant<[
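To make the SchedReadAdvance numbers above concrete: a consumer that reads a producer's result through one of these forwarding entries sees the value ReadAdvance cycles early, so the effective latency of an accumulation chain is the producer's latency minus the advance. A minimal stand-alone sketch of that arithmetic (plain C++, illustrative only, not part of the patch):

#include <algorithm>
#include <cstdio>

// Effective latency seen through a forwarding path; the scheduler never
// lets the cycle count go below zero.
static int effectiveLatency(int ProducerLatency, int ReadAdvance) {
  return std::max(0, ProducerLatency - ReadAdvance);
}

int main() {
  // FMADD (32-bit) accumulator via FalkorReadFMA32: latency 5, advance 1.
  std::printf("FMA32 chain: %d cycles\n", effectiveLatency(5, 1));
  // MADDWrrr accumulator via FalkorReadIMA32: latency 4, advance 3.
  std::printf("IMA32 chain: %d cycles\n", effectiveLatency(4, 3));
  // MADDXrrr accumulator via FalkorReadIMA64: latency 5, advance 4.
  std::printf("IMA64 chain: %d cycles\n", effectiveLatency(5, 4));
  return 0;
}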
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index abdeac019a18..1c81d34014fd 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -91,6 +91,8 @@ void AArch64Subtarget::initializeProperties() {
case Falkor:
MaxInterleaveFactor = 4;
VectorInsertExtractBaseCost = 2;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case Kryo:
MaxInterleaveFactor = 4;
@@ -99,6 +101,8 @@ void AArch64Subtarget::initializeProperties() {
PrefetchDistance = 740;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 11;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case ThunderX2T99:
CacheLineSize = 64;
@@ -108,6 +112,8 @@ void AArch64Subtarget::initializeProperties() {
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case ThunderX:
case ThunderXT88:
@@ -116,6 +122,8 @@ void AArch64Subtarget::initializeProperties() {
CacheLineSize = 128;
PrefFunctionAlignment = 3;
PrefLoopAlignment = 2;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case CortexA35: break;
case CortexA53: break;
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 5b9bee6e41b8..df54bf3f48e1 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -83,6 +83,9 @@ protected:
// NegativeImmediates - transform instructions with negative immediates
bool NegativeImmediates = true;
+ // Enable 64-bit vectorization in SLP.
+ unsigned MinVectorRegisterBitWidth = 64;
+
bool UseAA = false;
bool PredictableSelectIsExpensive = false;
bool BalanceFPOps = false;
@@ -106,6 +109,7 @@ protected:
unsigned PrefFunctionAlignment = 0;
unsigned PrefLoopAlignment = 0;
unsigned MaxJumpTableSize = 0;
+ unsigned WideningBaseCost = 0;
// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
@@ -190,6 +194,10 @@ public:
bool isXRaySupported() const override { return true; }
+ unsigned getMinVectorRegisterBitWidth() const {
+ return MinVectorRegisterBitWidth;
+ }
+
bool isX18Reserved() const { return ReserveX18; }
bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
@@ -228,6 +236,8 @@ public:
unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
+ unsigned getWideningBaseCost() const { return WideningBaseCost; }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 8875f9b72647..12a2e9a867f0 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -70,3 +70,11 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
return MCBinaryExpr::createSub(Res, PC, getContext());
}
+
+void AArch64_MachoTargetObjectFile::getNameWithPrefix(
+ SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const {
+  // AArch64 does not use section-relative relocations, so any global symbol
+  // must be accessed via at least a linker-private symbol.
+ getMangler().getNameWithPrefix(OutName, GV, /* CannotUsePrivateLabel */ true);
+}
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index 05e1dfa9e6c9..47e3bce43f6e 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -40,6 +40,9 @@ public:
const MCValue &MV, int64_t Offset,
MachineModuleInfo *MMI,
MCStreamer &Streamer) const override;
+
+ void getNameWithPrefix(SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const override;
};
} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4d59da0c646d..7c6f55c06bce 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -176,11 +176,95 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
+bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
+ ArrayRef<const Value *> Args) {
+
+  // A helper that returns a vector type from the given type. The element
+  // count of the destination type, DstTy, determines the vector width.
+ auto toVectorTy = [&](Type *ArgTy) {
+ return VectorType::get(ArgTy->getScalarType(),
+ DstTy->getVectorNumElements());
+ };
+
+ // Exit early if DstTy is not a vector type whose elements are at least
+  // 16 bits wide.
+  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
+ return false;
+
+ // Determine if the operation has a widening variant. We consider both the
+ // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
+ // instructions.
+ //
+ // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
+ // verify that their extending operands are eliminated during code
+ // generation.
+ switch (Opcode) {
+ case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
+ case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+ break;
+ default:
+ return false;
+ }
+
+  // To be a widening instruction (either the "wide" or "long" version), the
+  // second operand must be a sign- or zero-extend with a single user. We
+  // only consider single-use extends because they may otherwise not be
+  // eliminated.
+ if (Args.size() != 2 ||
+ (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
+ !Args[1]->hasOneUse())
+ return false;
+ auto *Extend = cast<CastInst>(Args[1]);
+
+ // Legalize the destination type and ensure it can be used in a widening
+ // operation.
+ auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
+ unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
+ if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
+ return false;
+
+ // Legalize the source type and ensure it can be used in a widening
+ // operation.
+ Type *SrcTy = toVectorTy(Extend->getSrcTy());
+ auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
+ unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
+ if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
+ return false;
+
+ // Get the total number of vector elements in the legalized types.
+ unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
+ unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
+
+ // Return true if the legalized types have the same number of vector elements
+ // and the destination element type size is twice that of the source type.
+ return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
+}
+
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ // If the cast is observable, and it is used by a widening instruction (e.g.,
+ // uaddl, saddw, etc.), it may be free.
+ if (I && I->hasOneUse()) {
+ auto *SingleUser = cast<Instruction>(*I->user_begin());
+ SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
+ if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
+ // If the cast is the second operand, it is free. We will generate either
+ // a "wide" or "long" version of the widening instruction.
+ if (I == SingleUser->getOperand(1))
+ return 0;
+ // If the cast is not the second operand, it will be free if it looks the
+ // same as the second operand. In this case, we will generate a "long"
+ // version of the widening instruction.
+ if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
+ if (I->getOpcode() == Cast->getOpcode() &&
+ cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
+ return 0;
+ }
+ }
+
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
@@ -379,6 +463,16 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
+ // add in the widening overhead specified by the sub-target. Since the
+ // extends feeding widening instructions are performed automatically, they
+ // aren't present in the generated code and have a zero cost. By adding a
+ // widening overhead here, we attach the total cost of the combined operation
+ // to the widening instruction.
+ int Cost = 0;
+ if (isWideningInstruction(Ty, Opcode, Args))
+ Cost += ST->getWideningBaseCost();
+
int ISD = TLI->InstructionOpcodeToISD(Opcode);
if (ISD == ISD::SDIV &&
@@ -388,9 +482,9 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the
// previous operation; conservatively assume OP_None.
- int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -405,8 +499,8 @@ int AArch64TTIImpl::getArithmeticInstrCost(
switch (ISD) {
default:
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:
@@ -414,7 +508,7 @@ int AArch64TTIImpl::getArithmeticInstrCost(
case ISD::AND:
// These nodes are marked as 'custom' for combining purposes only.
// We know that they are legal. See LowerAdd in ISelLowering.
- return 1 * LT.first;
+ return (Cost + 1) * LT.first;
}
}
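The legality test at the end of isWideningInstruction above boils down to arithmetic on the legalized types: equal total element counts, with destination elements exactly twice as wide. A stand-alone sketch of that final check (plain C++, not LLVM code; the struct and function names here are made up):

#include <cstdio>

// A legalized vector type: how many legal registers it splits into, and
// the shape of each register.
struct LegalizedTy {
  int NumParts; // number of legal registers after type legalization
  int NumElts;  // elements per legal register
  int EltBits;  // element width in bits
};

// Mirrors the final test in isWideningInstruction: equal total element
// counts, destination elements twice the width of source elements.
static bool isWideningPair(LegalizedTy Src, LegalizedTy Dst) {
  int NumSrcEls = Src.NumParts * Src.NumElts;
  int NumDstEls = Dst.NumParts * Dst.NumElts;
  return NumSrcEls == NumDstEls && 2 * Src.EltBits == Dst.EltBits;
}

int main() {
  // <8 x i8> -> <8 x i16>: the shape handled by uaddl/uaddw and friends.
  std::printf("v8i8 -> v8i16: %d\n", isWideningPair({1, 8, 8}, {1, 8, 16}));
  // <4 x i16> -> <4 x i64> (two v2i64 parts): rejected, width quadruples.
  std::printf("v4i16 -> v4i64: %d\n", isWideningPair({1, 4, 16}, {2, 2, 64}));
  return 0;
}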
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index e37c003e064c..280d97f3c502 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -43,6 +43,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
VECTOR_LDST_FOUR_ELEMENTS
};
+ bool isWideningInstruction(Type *Ty, unsigned Opcode,
+ ArrayRef<const Value *> Args);
+
public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -84,6 +87,10 @@ public:
return 64;
}
+ unsigned getMinVectorRegisterBitWidth() {
+ return ST->getMinVectorRegisterBitWidth();
+ }
+
unsigned getMaxInterleaveFactor(unsigned VF);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
@@ -134,6 +141,10 @@ public:
unsigned getMinPrefetchStride();
unsigned getMaxPrefetchIterationsAhead();
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const {
+ return false;
+ }
/// @}
};
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 4dbcc9581a84..449d732a8d44 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -3904,10 +3904,14 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
return false;
}
+static SMLoc incrementLoc(SMLoc L, int Offset) {
+ return SMLoc::getFromPointer(L.getPointer() + Offset);
+}
+
/// parseDirectiveCPU
/// ::= .cpu id
bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
- SMLoc CPULoc = getLoc();
+ SMLoc CurLoc = getLoc();
StringRef CPU, ExtensionString;
std::tie(CPU, ExtensionString) =
@@ -3923,15 +3927,19 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
// FIXME This is using tablegen data, but should be moved to ARMTargetParser
// once that is tablegen'ed
if (!getSTI().isCPUStringValid(CPU)) {
- Error(CPULoc, "unknown CPU name");
+ Error(CurLoc, "unknown CPU name");
return false;
}
MCSubtargetInfo &STI = copySTI();
STI.setDefaultFeatures(CPU, "");
+ CurLoc = incrementLoc(CurLoc, CPU.size());
FeatureBitset Features = STI.getFeatureBits();
for (auto Name : RequestedExtensions) {
+ // Advance source location past '+'.
+ CurLoc = incrementLoc(CurLoc, 1);
+
bool EnableFeature = true;
if (Name.startswith_lower("no")) {
@@ -3939,6 +3947,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
Name = Name.substr(2);
}
+ bool FoundExtension = false;
for (const auto &Extension : ExtensionMap) {
if (Extension.Name != Name)
continue;
@@ -3952,9 +3961,15 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
uint64_t Features =
ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
setAvailableFeatures(Features);
+ FoundExtension = true;
break;
}
+
+ if (!FoundExtension)
+ Error(CurLoc, "unsupported architectural extension");
+
+ CurLoc = incrementLoc(CurLoc, Name.size());
}
return false;
}
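The .cpu directive change above walks CurLoc across the operand so that diagnostics point at the offending extension rather than at the start of the directive. A stand-alone sketch of the same bookkeeping (plain C++, not LLVM code; the input string and the set of known extensions are made up):

#include <cstdio>
#include <cstring>

int main() {
  // Hypothetical operand of a ".cpu" directive: a CPU name followed by
  // "+extension" suffixes.
  const char *Operand = "falkor+crypto+bogus";
  const char *Loc = Operand + std::strlen("falkor"); // past the CPU name
  const char *Extensions[] = {"crypto", "bogus"};
  for (const char *Ext : Extensions) {
    ++Loc; // advance past '+', as incrementLoc(CurLoc, 1) does
    if (std::strcmp(Ext, "crypto") != 0) // pretend only "crypto" is known
      std::printf("error at offset %ld: unsupported architectural extension\n",
                  (long)(Loc - Operand));
    Loc += std::strlen(Ext); // advance past the extension name
  }
  return 0;
}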
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 94112849f84e..1b28df963b40 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -32,8 +32,9 @@ static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly")));
AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
- // We prefer NEON instructions to be printed in the short form.
- AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+ // We prefer NEON instructions to be printed in the short, Apple-specific
+ // form when targeting Darwin.
+ AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant;
PrivateGlobalPrefix = "L";
PrivateLabelPrefix = "L";
@@ -68,8 +69,9 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
if (T.getArch() == Triple::aarch64_be)
IsLittleEndian = false;
- // We prefer NEON instructions to be printed in the short form.
- AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
+ // We prefer NEON instructions to be printed in the generic form when
+ // targeting ELF.
+ AssemblerDialect = AsmWriterVariant == Default ? Generic : AsmWriterVariant;
CodePointerSize = 8;
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 8f6e1e7d8846..3f89702bed50 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -50,6 +50,10 @@ FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUMachineCFGStructurizerPass();
+
+void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
+extern char &AMDGPUMachineCFGStructurizerID;
ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 2e5b78bbf7ef..b279bd61e180 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -61,6 +61,24 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"Support flat address space"
>;
+def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets",
+ "FlatInstOffsets",
+ "true",
+ "Flat instructions have immediate offset addressing mode"
+>;
+
+def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts",
+ "FlatGlobalInsts",
+ "true",
+ "Have global_* flat memory instructions"
+>;
+
+def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
+ "FlatScratchInsts",
+ "true",
+ "Have scratch_* flat memory instructions"
+>;
+
def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
"UnalignedBufferAccess",
"true",
@@ -407,7 +425,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
- FeatureFastFMAF32, FeatureDPP
+ FeatureFastFMAF32, FeatureDPP,
+ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
]
>;
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index ccae36ced1f8..7c99752b881f 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -136,8 +136,7 @@ private:
bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
SDValue &ImmOffset, SDValue &VOffset) const;
- bool SelectFlat(SDValue Addr, SDValue &VAddr,
- SDValue &SLC, SDValue &TFE) const;
+ bool SelectFlat(SDValue Addr, SDValue &VAddr, SDValue &SLC) const;
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
@@ -1278,10 +1277,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr,
SDValue &VAddr,
- SDValue &SLC,
- SDValue &TFE) const {
+ SDValue &SLC) const {
VAddr = Addr;
- TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+ SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 915d1d9e0e68..f80652b87373 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -567,13 +567,19 @@ static bool hasSourceMods(const SDNode *N) {
case AMDGPUISD::INTERP_P1:
case AMDGPUISD::INTERP_P2:
case AMDGPUISD::DIV_SCALE:
+
+ // TODO: Should really be looking at the users of the bitcast. These are
+ // problematic because bitcasts are used to legalize all stores to integer
+ // types.
+ case ISD::BITCAST:
return false;
default:
return true;
}
}
-static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) {
+bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
+ unsigned CostThreshold) {
// Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
// it is truly free to use a source modifier in all cases. If there are
// multiple users but for each one will necessitate using VOP3, there will be
@@ -2299,7 +2305,7 @@ static bool isU24(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
DAG.computeKnownBits(Op, Known);
- return (VT.getSizeInBits() - Known.Zero.countLeadingOnes()) <= 24;
+ return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24;
}
static bool isI24(SDValue Op, SelectionDAG &DAG) {
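The isU24 change above is equivalent bit arithmetic: countMinLeadingZeros returns the number of leading zero bits known from computeKnownBits, and a value fits in 24 unsigned bits when its remaining width is at most 24. A stand-alone sketch of the test (plain C++, not LLVM code):

#include <cstdio>

// Mirrors the rewritten isU24 test: the value's effective width is its bit
// width minus the number of known leading zero bits.
static bool fitsInU24(unsigned BitWidth, unsigned KnownLeadingZeros) {
  return BitWidth - KnownLeadingZeros <= 24;
}

int main() {
  // An i32 value masked with 0xFFFF has at least 16 known leading zeros.
  std::printf("i32 & 0xFFFF: %d\n", fitsInU24(32, 16));
  // An i32 with no known bits may use all 32 bits, so it does not qualify.
  std::printf("unconstrained i32: %d\n", fitsInU24(32, 0));
  return 0;
}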
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index e1a5a2072418..4c588a7bafd0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -132,6 +132,8 @@ public:
return false;
}
+ static bool allUsesHaveSourceMods(const SDNode *N,
+ unsigned CostThreshold = 4);
bool isFAbsFree(EVT VT) const override;
bool isFNegFree(EVT VT) const override;
bool isTruncateFree(EVT Src, EVT Dest) const override;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8867ed689a31..a7eac080f885 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -127,9 +127,9 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
.add(I.getOperand(1))
.add(I.getOperand(0))
.addImm(0)
- .addImm(0)
.addImm(0);
+
// Now that we selected an opcode, we need to constrain the register
// operands to use appropriate classes.
bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
@@ -393,7 +393,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
.add(I.getOperand(0))
.addReg(PtrReg)
.addImm(0)
- .addImm(0)
.addImm(0);
bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a2567a549028..9de302994e68 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -33,6 +33,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
const LLT P1 = LLT::pointer(1, 64);
const LLT P2 = LLT::pointer(2, 64);
+ setAction({G_CONSTANT, S32}, Legal);
setAction({G_CONSTANT, S64}, Legal);
setAction({G_GEP, P1}, Legal);
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
new file mode 100644
index 000000000000..6d2785ba1c60
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -0,0 +1,2881 @@
+//===- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine instruction level CFG structurizer pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegionInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <tuple>
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpucfgstructurizer"
+
+namespace {
+class PHILinearizeDestIterator;
+
+class PHILinearize {
+ friend class PHILinearizeDestIterator;
+
+public:
+ typedef std::pair<unsigned, MachineBasicBlock *> PHISourceT;
+
+private:
+ typedef DenseSet<PHISourceT> PHISourcesT;
+ typedef struct {
+ unsigned DestReg;
+ DebugLoc DL;
+ PHISourcesT Sources;
+ } PHIInfoElementT;
+ typedef SmallPtrSet<PHIInfoElementT *, 2> PHIInfoT;
+ PHIInfoT PHIInfo;
+
+ static unsigned phiInfoElementGetDest(PHIInfoElementT *Info);
+ static void phiInfoElementSetDef(PHIInfoElementT *Info, unsigned NewDef);
+ static PHISourcesT &phiInfoElementGetSources(PHIInfoElementT *Info);
+ static void phiInfoElementAddSource(PHIInfoElementT *Info, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ static void phiInfoElementRemoveSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ PHIInfoElementT *findPHIInfoElement(unsigned DestReg);
+ PHIInfoElementT *findPHIInfoElementFromSource(unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+
+public:
+ bool findSourcesFromMBB(MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 4> &Sources);
+ void addDest(unsigned DestReg, const DebugLoc &DL);
+ void replaceDef(unsigned OldDestReg, unsigned NewDestReg);
+ void deleteDef(unsigned DestReg);
+ void addSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ void removeSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB = nullptr);
+ bool findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB,
+ unsigned &DestReg);
+ bool isSource(unsigned Reg, MachineBasicBlock *SourceMBB = nullptr);
+ unsigned getNumSources(unsigned DestReg);
+ void dump(MachineRegisterInfo *MRI);
+ void clear();
+
+ typedef PHISourcesT::iterator source_iterator;
+ typedef PHILinearizeDestIterator dest_iterator;
+
+ dest_iterator dests_begin();
+ dest_iterator dests_end();
+
+ source_iterator sources_begin(unsigned Reg);
+ source_iterator sources_end(unsigned Reg);
+};
+
+class PHILinearizeDestIterator {
+private:
+ PHILinearize::PHIInfoT::iterator Iter;
+
+public:
+ unsigned operator*() { return PHILinearize::phiInfoElementGetDest(*Iter); }
+ PHILinearizeDestIterator &operator++() {
+ ++Iter;
+ return *this;
+ }
+ bool operator==(const PHILinearizeDestIterator &I) const {
+ return I.Iter == Iter;
+ }
+ bool operator!=(const PHILinearizeDestIterator &I) const {
+ return I.Iter != Iter;
+ }
+
+ PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {}
+};
+
+unsigned PHILinearize::phiInfoElementGetDest(PHIInfoElementT *Info) {
+ return Info->DestReg;
+}
+
+void PHILinearize::phiInfoElementSetDef(PHIInfoElementT *Info,
+ unsigned NewDef) {
+ Info->DestReg = NewDef;
+}
+
+PHILinearize::PHISourcesT &
+PHILinearize::phiInfoElementGetSources(PHIInfoElementT *Info) {
+ return Info->Sources;
+}
+
+void PHILinearize::phiInfoElementAddSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+  // The assertion ensures that two different registers never share the
+  // same SourceMBB: a PHI cannot take different values from the same
+  // predecessor, though the same register may arrive from multiple
+  // predecessors.
+#if !defined(NDEBUG)
+ for (auto SI : phiInfoElementGetSources(Info)) {
+ assert((SI.second != SourceMBB || SourceReg == SI.first));
+ }
+#endif
+
+ phiInfoElementGetSources(Info).insert(PHISourceT(SourceReg, SourceMBB));
+}
+
+void PHILinearize::phiInfoElementRemoveSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ auto &Sources = phiInfoElementGetSources(Info);
+  SmallVector<PHISourceT, 4> EliminatedSources;
+  for (auto SI : Sources) {
+    if (SI.first == SourceReg &&
+        (SI.second == nullptr || SI.second == SourceMBB)) {
+      EliminatedSources.push_back(PHISourceT(SI.first, SI.second));
+    }
+  }
+
+  for (auto &Source : EliminatedSources) {
+    Sources.erase(Source);
+  }
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElement(unsigned DestReg) {
+ for (auto I : PHIInfo) {
+ if (phiInfoElementGetDest(I) == DestReg) {
+ return I;
+ }
+ }
+ return nullptr;
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElementFromSource(unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ for (auto I : PHIInfo) {
+ for (auto SI : phiInfoElementGetSources(I)) {
+ if (SI.first == SourceReg &&
+ (SI.second == nullptr || SI.second == SourceMBB)) {
+ return I;
+ }
+ }
+ }
+ return nullptr;
+}
+
+bool PHILinearize::findSourcesFromMBB(MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 4> &Sources) {
+ bool FoundSource = false;
+ for (auto I : PHIInfo) {
+ for (auto SI : phiInfoElementGetSources(I)) {
+ if (SI.second == SourceMBB) {
+ FoundSource = true;
+ Sources.push_back(SI.first);
+ }
+ }
+ }
+ return FoundSource;
+}
+
+void PHILinearize::addDest(unsigned DestReg, const DebugLoc &DL) {
+  assert(findPHIInfoElement(DestReg) == nullptr && "Dest already exists");
+ PHISourcesT EmptySet;
+ PHIInfoElementT *NewElement = new PHIInfoElementT();
+ NewElement->DestReg = DestReg;
+ NewElement->DL = DL;
+ NewElement->Sources = EmptySet;
+ PHIInfo.insert(NewElement);
+}
+
+void PHILinearize::replaceDef(unsigned OldDestReg, unsigned NewDestReg) {
+ phiInfoElementSetDef(findPHIInfoElement(OldDestReg), NewDestReg);
+}
+
+void PHILinearize::deleteDef(unsigned DestReg) {
+ PHIInfoElementT *InfoElement = findPHIInfoElement(DestReg);
+ PHIInfo.erase(InfoElement);
+ delete InfoElement;
+}
+
+void PHILinearize::addSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ phiInfoElementAddSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+void PHILinearize::removeSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ phiInfoElementRemoveSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+bool PHILinearize::findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB,
+ unsigned &DestReg) {
+ PHIInfoElementT *InfoElement =
+ findPHIInfoElementFromSource(SourceReg, SourceMBB);
+ if (InfoElement != nullptr) {
+ DestReg = phiInfoElementGetDest(InfoElement);
+ return true;
+ }
+ return false;
+}
+
+bool PHILinearize::isSource(unsigned Reg, MachineBasicBlock *SourceMBB) {
+ unsigned DestReg;
+ return findDest(Reg, SourceMBB, DestReg);
+}
+
+unsigned PHILinearize::getNumSources(unsigned DestReg) {
+ return phiInfoElementGetSources(findPHIInfoElement(DestReg)).size();
+}
+
+void PHILinearize::dump(MachineRegisterInfo *MRI) {
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ dbgs() << "=PHIInfo Start=\n";
+ for (auto PII : this->PHIInfo) {
+ PHIInfoElementT &Element = *PII;
+ dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI)
+ << " Sources: {";
+ for (auto &SI : Element.Sources) {
+ dbgs() << PrintReg(SI.first, TRI) << "(BB#"
+ << SI.second->getNumber() << "),";
+ }
+ dbgs() << "}\n";
+ }
+ dbgs() << "=PHIInfo End=\n";
+}
+
+void PHILinearize::clear() { PHIInfo = PHIInfoT(); }
+
+PHILinearize::dest_iterator PHILinearize::dests_begin() {
+ return PHILinearizeDestIterator(PHIInfo.begin());
+}
+
+PHILinearize::dest_iterator PHILinearize::dests_end() {
+ return PHILinearizeDestIterator(PHIInfo.end());
+}
+
+PHILinearize::source_iterator PHILinearize::sources_begin(unsigned Reg) {
+ auto InfoElement = findPHIInfoElement(Reg);
+ return phiInfoElementGetSources(InfoElement).begin();
+}
+PHILinearize::source_iterator PHILinearize::sources_end(unsigned Reg) {
+ auto InfoElement = findPHIInfoElement(Reg);
+ return phiInfoElementGetSources(InfoElement).end();
+}
+
+class RegionMRT;
+class MBBMRT;
+
+static unsigned getPHINumInputs(MachineInstr &PHI) {
+ assert(PHI.isPHI());
+ return (PHI.getNumOperands() - 1) / 2;
+}
+
+static MachineBasicBlock *getPHIPred(MachineInstr &PHI, unsigned Index) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(Index * 2 + 2).getMBB();
+}
+
+static void setPhiPred(MachineInstr &PHI, unsigned Index,
+ MachineBasicBlock *NewPred) {
+ PHI.getOperand(Index * 2 + 2).setMBB(NewPred);
+}
+
+static unsigned getPHISourceReg(MachineInstr &PHI, unsigned Index) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(Index * 2 + 1).getReg();
+}
+
+static unsigned getPHIDestReg(MachineInstr &PHI) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(0).getReg();
+}
+
+class LinearizedRegion {
+protected:
+ MachineBasicBlock *Entry;
+ // The exit block is part of the region, and is the last
+ // merge block before exiting the region.
+ MachineBasicBlock *Exit;
+ DenseSet<unsigned> LiveOuts;
+ SmallPtrSet<MachineBasicBlock *, 1> MBBs;
+ bool HasLoop;
+ LinearizedRegion *Parent;
+ RegionMRT *RMRT;
+
+ void storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
+ MachineInstr *DefInstr, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ void storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo);
+
+ void storeMBBLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo,
+ RegionMRT *TopRegion);
+
+ void storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ void storeLiveOuts(RegionMRT *Region, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo,
+ RegionMRT *TopRegion = nullptr);
+
+public:
+ void setRegionMRT(RegionMRT *Region) { RMRT = Region; }
+
+ RegionMRT *getRegionMRT() { return RMRT; }
+
+ void setParent(LinearizedRegion *P) { Parent = P; }
+
+ LinearizedRegion *getParent() { return Parent; }
+
+ void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr);
+
+ void setBBSelectRegIn(unsigned Reg);
+
+ unsigned getBBSelectRegIn();
+
+ void setBBSelectRegOut(unsigned Reg, bool IsLiveOut);
+
+ unsigned getBBSelectRegOut();
+
+ void setHasLoop(bool Value);
+
+ bool getHasLoop();
+
+ void addLiveOut(unsigned VReg);
+
+ void removeLiveOut(unsigned Reg);
+
+ void replaceLiveOut(unsigned OldReg, unsigned NewReg);
+
+ void replaceRegister(unsigned Register, unsigned NewRegister,
+ MachineRegisterInfo *MRI, bool ReplaceInside,
+ bool ReplaceOutside, bool IncludeLoopPHIs);
+
+ void replaceRegisterInsideRegion(unsigned Register, unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI);
+
+ void replaceRegisterOutsideRegion(unsigned Register, unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI);
+
+ DenseSet<unsigned> *getLiveOuts();
+
+ void setEntry(MachineBasicBlock *NewEntry);
+
+ MachineBasicBlock *getEntry();
+
+ void setExit(MachineBasicBlock *NewExit);
+
+ MachineBasicBlock *getExit();
+
+ void addMBB(MachineBasicBlock *MBB);
+
+ void addMBBs(LinearizedRegion *InnerRegion);
+
+ bool contains(MachineBasicBlock *MBB);
+
+ bool isLiveOut(unsigned Reg);
+
+ bool hasNoDef(unsigned Reg, MachineRegisterInfo *MRI);
+
+ void removeFalseRegisterKills(MachineRegisterInfo *MRI);
+
+ void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ LinearizedRegion();
+
+ ~LinearizedRegion();
+};
+
+class MRT {
+protected:
+ RegionMRT *Parent;
+ unsigned BBSelectRegIn;
+ unsigned BBSelectRegOut;
+
+public:
+ unsigned getBBSelectRegIn() { return BBSelectRegIn; }
+
+ unsigned getBBSelectRegOut() { return BBSelectRegOut; }
+
+ void setBBSelectRegIn(unsigned Reg) { BBSelectRegIn = Reg; }
+
+ void setBBSelectRegOut(unsigned Reg) { BBSelectRegOut = Reg; }
+
+ virtual RegionMRT *getRegionMRT() { return nullptr; }
+
+ virtual MBBMRT *getMBBMRT() { return nullptr; }
+
+ bool isRegion() { return getRegionMRT() != nullptr; }
+
+ bool isMBB() { return getMBBMRT() != nullptr; }
+
+ bool isRoot() { return Parent == nullptr; }
+
+ void setParent(RegionMRT *Region) { Parent = Region; }
+
+ RegionMRT *getParent() { return Parent; }
+
+ static MachineBasicBlock *
+ initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo,
+ DenseMap<MachineRegion *, RegionMRT *> &RegionMap);
+
+ static RegionMRT *buildMRT(MachineFunction &MF,
+ const MachineRegionInfo *RegionInfo,
+ const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI);
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) = 0;
+
+ void dumpDepth(int depth) {
+ for (int i = depth; i > 0; --i) {
+ dbgs() << " ";
+ }
+ }
+
+ virtual ~MRT() {}
+};
+
+class MBBMRT : public MRT {
+ MachineBasicBlock *MBB;
+
+public:
+ virtual MBBMRT *getMBBMRT() { return this; }
+
+ MachineBasicBlock *getMBB() { return MBB; }
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) {
+ dumpDepth(depth);
+ dbgs() << "MBB: " << getMBB()->getNumber();
+ dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI);
+ dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n";
+ }
+
+ MBBMRT(MachineBasicBlock *BB) : MBB(BB) {
+ setParent(nullptr);
+ setBBSelectRegOut(0);
+ setBBSelectRegIn(0);
+ }
+};
+
+class RegionMRT : public MRT {
+protected:
+ MachineRegion *Region;
+ LinearizedRegion *LRegion;
+ MachineBasicBlock *Succ;
+
+ SetVector<MRT *> Children;
+
+public:
+ virtual RegionMRT *getRegionMRT() { return this; }
+
+ void setLinearizedRegion(LinearizedRegion *LinearizeRegion) {
+ LRegion = LinearizeRegion;
+ }
+
+ LinearizedRegion *getLinearizedRegion() { return LRegion; }
+
+ MachineRegion *getMachineRegion() { return Region; }
+
+ unsigned getInnerOutputRegister() {
+ return (*(Children.begin()))->getBBSelectRegOut();
+ }
+
+ void addChild(MRT *Tree) { Children.insert(Tree); }
+
+ SetVector<MRT *> *getChildren() { return &Children; }
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) {
+ dumpDepth(depth);
+ dbgs() << "Region: " << (void *)Region;
+ dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI);
+ dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n";
+
+ dumpDepth(depth);
+ if (getSucc())
+ dbgs() << "Succ: " << getSucc()->getNumber() << "\n";
+ else
+ dbgs() << "Succ: none \n";
+ for (auto MRTI : Children) {
+ MRTI->dump(TRI, depth + 1);
+ }
+ }
+
+ MRT *getEntryTree() { return Children.back(); }
+
+ MRT *getExitTree() { return Children.front(); }
+
+ MachineBasicBlock *getEntry() {
+ MRT *Tree = Children.back();
+ return (Tree->isRegion()) ? Tree->getRegionMRT()->getEntry()
+ : Tree->getMBBMRT()->getMBB();
+ }
+
+ MachineBasicBlock *getExit() {
+ MRT *Tree = Children.front();
+ return (Tree->isRegion()) ? Tree->getRegionMRT()->getExit()
+ : Tree->getMBBMRT()->getMBB();
+ }
+
+ void setSucc(MachineBasicBlock *MBB) { Succ = MBB; }
+
+ MachineBasicBlock *getSucc() { return Succ; }
+
+ bool contains(MachineBasicBlock *MBB) {
+ for (auto CI : Children) {
+ if (CI->isMBB()) {
+ if (MBB == CI->getMBBMRT()->getMBB()) {
+ return true;
+ }
+ } else {
+ if (CI->getRegionMRT()->contains(MBB)) {
+ return true;
+ } else if (CI->getRegionMRT()->getLinearizedRegion() != nullptr &&
+ CI->getRegionMRT()->getLinearizedRegion()->contains(MBB)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ void replaceLiveOutReg(unsigned Register, unsigned NewRegister) {
+ LinearizedRegion *LRegion = getLinearizedRegion();
+ LRegion->replaceLiveOut(Register, NewRegister);
+ for (auto &CI : Children) {
+ if (CI->isRegion()) {
+ CI->getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
+ }
+ }
+ }
+
+ RegionMRT(MachineRegion *MachineRegion)
+ : Region(MachineRegion), LRegion(nullptr), Succ(nullptr) {
+ setParent(nullptr);
+ setBBSelectRegOut(0);
+ setBBSelectRegIn(0);
+ }
+
+ virtual ~RegionMRT() {
+ if (LRegion) {
+ delete LRegion;
+ }
+
+ for (auto CI : Children) {
+ delete &(*CI);
+ }
+ }
+};
+
+static unsigned createBBSelectReg(const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI) {
+ return MRI->createVirtualRegister(TII->getPreferredSelectRegClass(32));
+}
+
+MachineBasicBlock *
+MRT::initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo,
+ DenseMap<MachineRegion *, RegionMRT *> &RegionMap) {
+ for (auto &MFI : MF) {
+ MachineBasicBlock *ExitMBB = &MFI;
+ if (ExitMBB->succ_size() == 0) {
+ return ExitMBB;
+ }
+ }
+ llvm_unreachable("CFG has no exit block");
+ return nullptr;
+}
+
+RegionMRT *MRT::buildMRT(MachineFunction &MF,
+ const MachineRegionInfo *RegionInfo,
+ const SIInstrInfo *TII, MachineRegisterInfo *MRI) {
+ SmallPtrSet<MachineRegion *, 4> PlacedRegions;
+ DenseMap<MachineRegion *, RegionMRT *> RegionMap;
+ MachineRegion *TopLevelRegion = RegionInfo->getTopLevelRegion();
+ RegionMRT *Result = new RegionMRT(TopLevelRegion);
+ RegionMap[TopLevelRegion] = Result;
+
+  // Insert the exit block first; we need it to be the merge node
+  // for the top-level region.
+ MachineBasicBlock *Exit = initializeMRT(MF, RegionInfo, RegionMap);
+
+ unsigned BBSelectRegIn = createBBSelectReg(TII, MRI);
+ MBBMRT *ExitMRT = new MBBMRT(Exit);
+ RegionMap[RegionInfo->getRegionFor(Exit)]->addChild(ExitMRT);
+ ExitMRT->setBBSelectRegIn(BBSelectRegIn);
+
+ for (auto MBBI : post_order(&(MF.front()))) {
+ MachineBasicBlock *MBB = &(*MBBI);
+
+ // Skip Exit since we already added it
+ if (MBB == Exit) {
+ continue;
+ }
+
+ DEBUG(dbgs() << "Visiting BB#" << MBB->getNumber() << "\n");
+ MBBMRT *NewMBB = new MBBMRT(MBB);
+ MachineRegion *Region = RegionInfo->getRegionFor(MBB);
+
+ // Ensure we have the MRT region
+ if (RegionMap.count(Region) == 0) {
+ RegionMRT *NewMRTRegion = new RegionMRT(Region);
+ RegionMap[Region] = NewMRTRegion;
+
+ // Ensure all parents are in the RegionMap
+ MachineRegion *Parent = Region->getParent();
+ while (RegionMap.count(Parent) == 0) {
+ RegionMRT *NewMRTParent = new RegionMRT(Parent);
+ NewMRTParent->addChild(NewMRTRegion);
+ NewMRTRegion->setParent(NewMRTParent);
+ RegionMap[Parent] = NewMRTParent;
+ NewMRTRegion = NewMRTParent;
+ Parent = Parent->getParent();
+ }
+ RegionMap[Parent]->addChild(NewMRTRegion);
+ NewMRTRegion->setParent(RegionMap[Parent]);
+ }
+
+ // Add MBB to Region MRT
+ RegionMap[Region]->addChild(NewMBB);
+ NewMBB->setParent(RegionMap[Region]);
+ RegionMap[Region]->setSucc(Region->getExit());
+ }
+ return Result;
+}
+
+void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ if (TRI->isVirtualRegister(Reg)) {
+ DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+ // If this is a source register to a PHI we are chaining, it
+ // must be live out.
+ if (PHIInfo.isSource(Reg)) {
+ DEBUG(dbgs() << "Add LiveOut (PHI): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ } else {
+ // If this is live out of the MBB
+ for (auto &UI : MRI->use_operands(Reg)) {
+ if (UI.getParent()->getParent() != MBB) {
+ DEBUG(dbgs() << "Add LiveOut (MBB BB#" << MBB->getNumber()
+ << "): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ } else {
+          // If the use is in the same MBB, we have to make sure it comes
+          // after the def; otherwise the register is live out via a loop.
+ MachineInstr *UseInstr = UI.getParent();
+ for (MachineBasicBlock::instr_iterator
+ MII = UseInstr->getIterator(),
+ MIE = UseInstr->getParent()->instr_end();
+ MII != MIE; ++MII) {
+ if ((&(*MII)) == DefInstr) {
+ DEBUG(dbgs() << "Add LiveOut (Loop): " << PrintReg(Reg, TRI)
+ << "\n");
+ addLiveOut(Reg);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ if (TRI->isVirtualRegister(Reg)) {
+ DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+ for (auto &UI : MRI->use_operands(Reg)) {
+ if (!Region->contains(UI.getParent()->getParent())) {
+ DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
+ << "): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ DEBUG(dbgs() << "-Store Live Outs Begin (BB#" << MBB->getNumber() << ")-\n");
+ for (auto &II : *MBB) {
+ for (auto &RI : II.defs()) {
+ storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo);
+ }
+ for (auto &IRI : II.implicit_operands()) {
+ if (IRI.isDef()) {
+ storeLiveOutReg(MBB, IRI.getReg(), IRI.getParent(), MRI, TRI, PHIInfo);
+ }
+ }
+ }
+
+  // If a successor has a PHI with a source coming from this MBB, we have
+  // to add that source register as a live out.
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ E = MBB->succ_end();
+ SI != E; ++SI) {
+ for (auto &II : *(*SI)) {
+ if (II.isPHI()) {
+ MachineInstr &PHI = II;
+ int numPreds = getPHINumInputs(PHI);
+ for (int i = 0; i < numPreds; ++i) {
+ if (getPHIPred(PHI, i) == MBB) {
+ unsigned PHIReg = getPHISourceReg(PHI, i);
+ DEBUG(dbgs() << "Add LiveOut (PhiSource BB#" << MBB->getNumber()
+ << " -> BB#" << (*SI)->getNumber()
+ << "): " << PrintReg(PHIReg, TRI) << "\n");
+ addLiveOut(PHIReg);
+ }
+ }
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "-Store Live Outs Endn-\n");
+}
+
+void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo,
+ RegionMRT *TopRegion) {
+ for (auto &II : *MBB) {
+ for (auto &RI : II.defs()) {
+ storeLiveOutRegRegion(TopRegion, RI.getReg(), RI.getParent(), MRI, TRI,
+ PHIInfo);
+ }
+ for (auto &IRI : II.implicit_operands()) {
+ if (IRI.isDef()) {
+ storeLiveOutRegRegion(TopRegion, IRI.getReg(), IRI.getParent(), MRI,
+ TRI, PHIInfo);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo,
+ RegionMRT *CurrentTopRegion) {
+ MachineBasicBlock *Exit = Region->getSucc();
+
+ RegionMRT *TopRegion =
+ CurrentTopRegion == nullptr ? Region : CurrentTopRegion;
+
+  // If the exit is the end of the function, there are no live outs.
+ if (Exit == nullptr)
+ return;
+
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (CI->isMBB()) {
+ auto MBB = CI->getMBBMRT()->getMBB();
+ storeMBBLiveOuts(MBB, MRI, TRI, PHIInfo, TopRegion);
+ } else {
+ LinearizedRegion *SubRegion = CI->getRegionMRT()->getLinearizedRegion();
+      // We should only store registers that are live out of the
+      // linearized region.
+ for (auto MBBI : SubRegion->MBBs) {
+ storeMBBLiveOuts(MBBI, MRI, TRI, PHIInfo, TopRegion);
+ }
+ }
+ }
+
+ if (CurrentTopRegion == nullptr) {
+ auto Succ = Region->getSucc();
+ for (auto &II : *Succ) {
+ if (II.isPHI()) {
+ MachineInstr &PHI = II;
+ int numPreds = getPHINumInputs(PHI);
+ for (int i = 0; i < numPreds; ++i) {
+ if (Region->contains(getPHIPred(PHI, i))) {
+ unsigned PHIReg = getPHISourceReg(PHI, i);
+ DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
+ << "): " << PrintReg(PHIReg, TRI) << "\n");
+ addLiveOut(PHIReg);
+ }
+ }
+ }
+ }
+ }
+}
+
+void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+ OS << "Linearized Region {";
+ bool IsFirst = true;
+ for (const auto &MBB : MBBs) {
+ if (IsFirst) {
+ IsFirst = false;
+ } else {
+ OS << " ,";
+ }
+ OS << MBB->getNumber();
+ }
+ OS << "} (" << Entry->getNumber() << ", "
+ << (Exit == nullptr ? -1 : Exit->getNumber())
+ << "): In:" << PrintReg(getBBSelectRegIn(), TRI)
+ << " Out:" << PrintReg(getBBSelectRegOut(), TRI) << " {";
+ for (auto &LI : LiveOuts) {
+ OS << PrintReg(LI, TRI) << " ";
+ }
+ OS << "} \n";
+}
+
+unsigned LinearizedRegion::getBBSelectRegIn() {
+ return getRegionMRT()->getBBSelectRegIn();
+}
+
+unsigned LinearizedRegion::getBBSelectRegOut() {
+ return getRegionMRT()->getBBSelectRegOut();
+}
+
+void LinearizedRegion::setHasLoop(bool Value) { HasLoop = Value; }
+
+bool LinearizedRegion::getHasLoop() { return HasLoop; }
+
+void LinearizedRegion::addLiveOut(unsigned VReg) { LiveOuts.insert(VReg); }
+
+void LinearizedRegion::removeLiveOut(unsigned Reg) {
+ if (isLiveOut(Reg))
+ LiveOuts.erase(Reg);
+}
+
+void LinearizedRegion::replaceLiveOut(unsigned OldReg, unsigned NewReg) {
+ if (isLiveOut(OldReg)) {
+ removeLiveOut(OldReg);
+ addLiveOut(NewReg);
+ }
+}
+
+void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
+ MachineRegisterInfo *MRI,
+ bool ReplaceInside, bool ReplaceOutside,
+ bool IncludeLoopPHI) {
+ assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+ DEBUG(dbgs() << "Pepareing to replace register (region): "
+ << PrintReg(Register, MRI->getTargetRegisterInfo()) << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
+
+ // If we are replacing outside, we also need to update the LiveOuts
+ if (ReplaceOutside &&
+ (isLiveOut(Register) || this->getParent()->isLiveOut(Register))) {
+ LinearizedRegion *Current = this;
+ while (Current != nullptr && Current->getEntry() != nullptr) {
+ DEBUG(dbgs() << "Region before register replace\n");
+ DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ Current->replaceLiveOut(Register, NewRegister);
+ DEBUG(dbgs() << "Region after register replace\n");
+ DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ Current = Current->getParent();
+ }
+ }
+
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+ E = MRI->reg_end();
+ I != E;) {
+ MachineOperand &O = *I;
+ ++I;
+
+ // We don't rewrite defs.
+ if (O.isDef())
+ continue;
+
+ bool IsInside = contains(O.getParent()->getParent());
+ bool IsLoopPHI = IsInside && (O.getParent()->isPHI() &&
+ O.getParent()->getParent() == getEntry());
+ bool ShouldReplace = (IsInside && ReplaceInside) ||
+ (!IsInside && ReplaceOutside) ||
+ (IncludeLoopPHI && IsLoopPHI);
+ if (ShouldReplace) {
+
+ if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ DEBUG(dbgs() << "Trying to substitute physical register: "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ llvm_unreachable("Cannot substitute physical registers");
+ } else {
+ DEBUG(dbgs() << "Replacing register (region): "
+ << PrintReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ O.setReg(NewRegister);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::replaceRegisterInsideRegion(unsigned Register,
+ unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI) {
+ replaceRegister(Register, NewRegister, MRI, true, false, IncludeLoopPHIs);
+}
+
+void LinearizedRegion::replaceRegisterOutsideRegion(unsigned Register,
+ unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI) {
+ replaceRegister(Register, NewRegister, MRI, false, true, IncludeLoopPHIs);
+}
+
+DenseSet<unsigned> *LinearizedRegion::getLiveOuts() { return &LiveOuts; }
+
+void LinearizedRegion::setEntry(MachineBasicBlock *NewEntry) {
+ Entry = NewEntry;
+}
+
+MachineBasicBlock *LinearizedRegion::getEntry() { return Entry; }
+
+void LinearizedRegion::setExit(MachineBasicBlock *NewExit) { Exit = NewExit; }
+
+MachineBasicBlock *LinearizedRegion::getExit() { return Exit; }
+
+void LinearizedRegion::addMBB(MachineBasicBlock *MBB) { MBBs.insert(MBB); }
+
+void LinearizedRegion::addMBBs(LinearizedRegion *InnerRegion) {
+ for (const auto &MBB : InnerRegion->MBBs) {
+ addMBB(MBB);
+ }
+}
+
+bool LinearizedRegion::contains(MachineBasicBlock *MBB) {
+ return MBBs.count(MBB) == 1;
+}
+
+bool LinearizedRegion::isLiveOut(unsigned Reg) {
+ return LiveOuts.count(Reg) == 1;
+}
+
+bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) {
+ return MRI->def_begin(Reg) == MRI->def_end();
+}
+
+// After the code has been structurized, operands that were flagged as
+// kills before are no longer register kills.
+void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ for (auto MBBI : MBBs) {
+ MachineBasicBlock *MBB = MBBI;
+ for (auto &II : *MBB) {
+ for (auto &RI : II.uses()) {
+ if (RI.isReg()) {
+ unsigned Reg = RI.getReg();
+ if (TRI->isVirtualRegister(Reg)) {
+ if (hasNoDef(Reg, MRI))
+ continue;
+ if (!MRI->hasOneDef(Reg)) {
+ DEBUG(this->getEntry()->getParent()->dump());
+ DEBUG(dbgs() << PrintReg(Reg, TRI) << "\n");
+ }
+
+ if (MRI->def_begin(Reg) == MRI->def_end()) {
+ DEBUG(dbgs() << "Register "
+ << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
+ } else if (!MRI->hasOneDef(Reg)) {
+ DEBUG(dbgs() << "Register "
+ << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
+ }
+
+ assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
+ MachineOperand *Def = &(*(MRI->def_begin(Reg)));
+ MachineOperand *UseOperand = &(RI);
+ bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB;
+ if (UseIsOutsideDefMBB && UseOperand->isKill()) {
+ DEBUG(dbgs() << "Removing kill flag on register: "
+ << PrintReg(Reg, TRI) << "\n");
+ UseOperand->setIsKill(false);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void LinearizedRegion::initLiveOut(RegionMRT *Region,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ storeLiveOuts(Region, MRI, TRI, PHIInfo);
+}
+
+LinearizedRegion::LinearizedRegion(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ setEntry(MBB);
+ setExit(MBB);
+ storeLiveOuts(MBB, MRI, TRI, PHIInfo);
+ MBBs.insert(MBB);
+ Parent = nullptr;
+}
+
+LinearizedRegion::LinearizedRegion() {
+ setEntry(nullptr);
+ setExit(nullptr);
+ Parent = nullptr;
+}
+
+LinearizedRegion::~LinearizedRegion() {}
+
+class AMDGPUMachineCFGStructurizer : public MachineFunctionPass {
+private:
+ const MachineRegionInfo *Regions;
+ const SIInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ unsigned BBSelectRegister;
+ PHILinearize PHIInfo;
+ DenseMap<MachineBasicBlock *, MachineBasicBlock *> FallthroughMap;
+
+ void getPHIRegionIndices(RegionMRT *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &RegionIndices);
+ void getPHIRegionIndices(LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &RegionIndices);
+ void getPHINonRegionIndices(LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHINonRegionIndices);
+
+ void storePHILinearizationInfoDest(
+ unsigned LDestReg, MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices = nullptr);
+
+ unsigned storePHILinearizationInfo(MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices);
+
+ void extractKilledPHIs(MachineBasicBlock *MBB);
+
+ bool shrinkPHI(MachineInstr &PHI, SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg);
+
+ bool shrinkPHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 2> &PHIIndices, unsigned *ReplaceReg);
+
+ void replacePHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *LastMerge,
+ SmallVector<unsigned, 2> &PHIRegionIndices);
+ void replaceEntryPHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *IfMBB,
+ SmallVector<unsigned, 2> &PHIRegionIndices);
+ void replaceLiveOutRegs(MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices,
+ unsigned CombinedSourceReg,
+ LinearizedRegion *LRegion);
+ void rewriteRegionExitPHI(RegionMRT *Region, MachineBasicBlock *LastMerge,
+ MachineInstr &PHI, LinearizedRegion *LRegion);
+
+ void rewriteRegionExitPHIs(RegionMRT *Region, MachineBasicBlock *LastMerge,
+ LinearizedRegion *LRegion);
+ void rewriteRegionEntryPHI(LinearizedRegion *Region, MachineBasicBlock *IfMBB,
+ MachineInstr &PHI);
+ void rewriteRegionEntryPHIs(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB);
+
+ bool regionIsSimpleIf(RegionMRT *Region);
+
+ void transformSimpleIfRegion(RegionMRT *Region);
+
+ void eliminateDeadBranchOperands(MachineBasicBlock::instr_iterator &II);
+
+ void insertUnconditionalBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock *Dest,
+ const DebugLoc &DL = DebugLoc());
+
+ MachineBasicBlock *createLinearizedExitBlock(RegionMRT *Region);
+
+ void insertMergePHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB, unsigned DestRegister,
+ unsigned IfSourceRegister, unsigned CodeSourceRegister,
+ bool IsUndefIfSource = false);
+
+ MachineBasicBlock *createIfBlock(MachineBasicBlock *MergeBB,
+ MachineBasicBlock *CodeBBStart,
+ MachineBasicBlock *CodeBBEnd,
+ MachineBasicBlock *SelectBB, unsigned IfReg,
+ bool InheritPreds);
+
+ void prunePHIInfo(MachineBasicBlock *MBB);
+ void createEntryPHI(LinearizedRegion *CurrentRegion, unsigned DestReg);
+
+ void createEntryPHIs(LinearizedRegion *CurrentRegion);
+ void resolvePHIInfos(MachineBasicBlock *FunctionEntry);
+
+ void replaceRegisterWith(unsigned Register, unsigned NewRegister);
+
+ MachineBasicBlock *createIfRegion(MachineBasicBlock *MergeBB,
+ MachineBasicBlock *CodeBB,
+ LinearizedRegion *LRegion,
+ unsigned BBSelectRegIn,
+ unsigned BBSelectRegOut);
+
+ MachineBasicBlock *
+ createIfRegion(MachineBasicBlock *MergeMBB, LinearizedRegion *InnerRegion,
+ LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB,
+ unsigned BBSelectRegIn, unsigned BBSelectRegOut);
+ void ensureCondIsNotKilled(SmallVector<MachineOperand, 1> Cond);
+
+ void rewriteCodeBBTerminator(MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned BBSelectReg);
+
+ MachineInstr *getDefInstr(unsigned Reg);
+ void insertChainedPHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion, unsigned DestReg,
+ unsigned SourceReg);
+ bool containsDef(MachineBasicBlock *MBB, LinearizedRegion *InnerRegion,
+ unsigned Register);
+ void rewriteLiveOutRegs(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ LinearizedRegion *LRegion);
+
+ void splitLoopPHI(MachineInstr &PHI, MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc, LinearizedRegion *LRegion);
+ void splitLoopPHIs(MachineBasicBlock *Entry, MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion);
+
+ MachineBasicBlock *splitExit(LinearizedRegion *LRegion);
+
+ MachineBasicBlock *splitEntry(LinearizedRegion *LRegion);
+
+ LinearizedRegion *initLinearizedRegion(RegionMRT *Region);
+
+ bool structurizeComplexRegion(RegionMRT *Region);
+
+ bool structurizeRegion(RegionMRT *Region);
+
+ bool structurizeRegions(RegionMRT *Region, bool isTopRegion);
+
+public:
+ static char ID;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineRegionInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) {
+ initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ void initFallthroughMap(MachineFunction &MF);
+
+ void createLinearizedRegion(RegionMRT *Region, unsigned SelectOut);
+
+ unsigned initializeSelectRegisters(MRT *MRT, unsigned ExistingExitReg,
+ MachineRegisterInfo *MRI,
+ const SIInstrInfo *TII);
+
+ RegionMRT *RMRT;
+ void setRegionMRT(RegionMRT *RegionTree) { RMRT = RegionTree; }
+
+ RegionMRT *getRegionMRT() { return RMRT; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char AMDGPUMachineCFGStructurizer::ID = 0;
+
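+// A simple if region consists of an entry block with exactly two
+// successors, where one successor is the region successor itself
+// (the bypass edge) and the other has the region successor as its
+// single successor (the if body).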
+bool AMDGPUMachineCFGStructurizer::regionIsSimpleIf(RegionMRT *Region) {
+ MachineBasicBlock *Entry = Region->getEntry();
+ MachineBasicBlock *Succ = Region->getSucc();
+ bool FoundBypass = false;
+ bool FoundIf = false;
+
+ if (Entry->succ_size() != 2) {
+ return false;
+ }
+
+ for (MachineBasicBlock::const_succ_iterator SI = Entry->succ_begin(),
+ E = Entry->succ_end();
+ SI != E; ++SI) {
+ MachineBasicBlock *Current = *SI;
+
+ if (Current == Succ) {
+ FoundBypass = true;
+ } else if ((Current->succ_size() == 1) &&
+ *(Current->succ_begin()) == Succ) {
+ FoundIf = true;
+ }
+ }
+
+ return FoundIf && FoundBypass;
+}
+
+void AMDGPUMachineCFGStructurizer::transformSimpleIfRegion(RegionMRT *Region) {
+ MachineBasicBlock *Entry = Region->getEntry();
+ MachineBasicBlock *Exit = Region->getExit();
+ TII->convertNonUniformIfRegion(Entry, Exit);
+}
+
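+// If MBB has a single successor, retarget all terminator block
+// operands to that successor.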
+static void fixMBBTerminator(MachineBasicBlock *MBB) {
+ if (MBB->succ_size() == 1) {
+ auto *Succ = *(MBB->succ_begin());
+ for (auto &TI : MBB->terminators()) {
+ for (auto &UI : TI.uses()) {
+ if (UI.isMBB() && UI.getMBB() != Succ) {
+ UI.setMBB(Succ);
+ }
+ }
+ }
+ }
+}
+
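+// Retarget the terminators of the region exit block: any target that
+// is neither the internal nor the external successor of the
+// linearized region is redirected to the external successor.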
+static void fixRegionTerminator(RegionMRT *Region) {
+ MachineBasicBlock *InternalSucc = nullptr;
+ MachineBasicBlock *ExternalSucc = nullptr;
+ LinearizedRegion *LRegion = Region->getLinearizedRegion();
+ auto Exit = LRegion->getExit();
+
+ SmallPtrSet<MachineBasicBlock *, 2> Successors;
+ for (MachineBasicBlock::const_succ_iterator SI = Exit->succ_begin(),
+ SE = Exit->succ_end();
+ SI != SE; ++SI) {
+ MachineBasicBlock *Succ = *SI;
+ if (LRegion->contains(Succ)) {
+ // Do not allow re-assign
+ assert(InternalSucc == nullptr);
+ InternalSucc = Succ;
+ } else {
+ // Do not allow re-assign
+ assert(ExternalSucc == nullptr);
+ ExternalSucc = Succ;
+ }
+ }
+
+ for (auto &TI : Exit->terminators()) {
+ for (auto &UI : TI.uses()) {
+ if (UI.isMBB()) {
+ auto Target = UI.getMBB();
+ if (Target != InternalSucc && Target != ExternalSucc) {
+ UI.setMBB(ExternalSucc);
+ }
+ }
+ }
+ }
+}
+
+// If a region is just a sequence of regions (and the exit block in
+// the case of the top level region), we can simply skip linearizing
+// it, because it is already linear.
+bool regionIsSequence(RegionMRT *Region) {
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (!CI->isRegion()) {
+ if (CI->getMBBMRT()->getMBB()->succ_size() > 1) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+void fixupRegionExits(RegionMRT *Region) {
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (!CI->isRegion()) {
+ fixMBBTerminator(CI->getMBBMRT()->getMBB());
+ } else {
+ fixRegionTerminator(CI->getRegionMRT());
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+ RegionMRT *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (Region->contains(Pred)) {
+ PHIRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+ LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (Region->contains(Pred)) {
+ PHIRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHINonRegionIndices(
+ LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHINonRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (!Region->contains(Pred)) {
+ PHINonRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest(
+ unsigned LDestReg, MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices) {
+ if (RegionIndices) {
+ for (auto i : *RegionIndices) {
+ PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+ }
+ } else {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+ }
+ }
+}
+
+unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo(
+ MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) {
+ unsigned DestReg = getPHIDestReg(PHI);
+ unsigned LinearizeDestReg =
+ MRI->createVirtualRegister(MRI->getRegClass(DestReg));
+ PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc());
+ storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices);
+ return LinearizeDestReg;
+}
+
+void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) {
+ // We need to create a new chain for the killed phi, but there is no
+ // need to do the renaming outside or inside the block.
+ SmallPtrSet<MachineInstr *, 2> PHIs;
+ for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+ E = MBB->instr_end();
+ I != E; ++I) {
+ MachineInstr &Instr = *I;
+ if (Instr.isPHI()) {
+ unsigned PHIDestReg = getPHIDestReg(Instr);
+ DEBUG(dbgs() << "Extractking killed phi:\n");
+ DEBUG(Instr.dump());
+ PHIs.insert(&Instr);
+ PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc());
+ storePHILinearizationInfoDest(PHIDestReg, Instr);
+ }
+ }
+
+ for (auto PI : PHIs) {
+ PI->eraseFromParent();
+ }
+}
+
+static bool isPHIRegionIndex(const SmallVector<unsigned, 2> &PHIRegionIndices,
+ unsigned Index) {
+ for (auto i : PHIRegionIndices) {
+ if (i == Index)
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg) {
+ return shrinkPHI(PHI, 0, nullptr, PHIIndices, ReplaceReg);
+}
+
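+// Remove the PHI inputs listed in PHIIndices. If exactly one other
+// input remains, no PHI is needed: its source register is reported
+// through ReplaceReg and true is returned. Otherwise the PHI is
+// rebuilt with the remaining inputs, plus CombinedSourceReg incoming
+// from SourceMBB when SourceMBB is non-null.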
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+ unsigned CombinedSourceReg,
+ MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg) {
+ DEBUG(dbgs() << "Shrink PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " to " << PrintReg(getPHIDestReg(PHI), TRI)
+ << "<def> = PHI(");
+
+ bool Replaced = false;
+ unsigned NumInputs = getPHINumInputs(PHI);
+ int SingleExternalEntryIndex = -1;
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (!isPHIRegionIndex(PHIIndices, i)) {
+ if (SingleExternalEntryIndex == -1) {
+ // Single entry
+ SingleExternalEntryIndex = i;
+ } else {
+ // Multiple entries
+ SingleExternalEntryIndex = -2;
+ }
+ }
+ }
+
+ if (SingleExternalEntryIndex > -1) {
+ *ReplaceReg = getPHISourceReg(PHI, SingleExternalEntryIndex);
+ // We should not rewrite the code, we should only pick up the single value
+ // that represents the shrunk PHI.
+ Replaced = true;
+ } else {
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ if (SourceMBB) {
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(SourceMBB);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << SourceMBB->getNumber());
+ }
+
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ }
+ PHI.eraseFromParent();
+ return Replaced;
+}
+
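+// Replace the PHI inputs listed in PHIRegionIndices with a single
+// input, CombinedSourceReg, incoming from the linearized exit block
+// LastMerge. If no other inputs exist, the PHI is deleted and its
+// destination register is renamed to CombinedSourceReg.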
+void AMDGPUMachineCFGStructurizer::replacePHI(
+ MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ DEBUG(dbgs() << "Replace PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " with " << PrintReg(getPHIDestReg(PHI), TRI)
+ << "<def> = PHI(");
+
+ bool HasExternalEdge = false;
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (!isPHIRegionIndex(PHIRegionIndices, i)) {
+ HasExternalEdge = true;
+ }
+ }
+
+ if (HasExternalEdge) {
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(LastMerge);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << LastMerge->getNumber());
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ } else {
+ replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg);
+ }
+ PHI.eraseFromParent();
+}
+
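+// Replace the PHI inputs listed in PHIRegionIndices with a single
+// input, CombinedSourceReg, incoming from the new if block IfMBB. If
+// no other inputs remain, the PHI is deleted and its destination
+// register is renamed to CombinedSourceReg.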
+void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
+ MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ DEBUG(dbgs() << "Replace entry PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " with ");
+
+ unsigned NumInputs = getPHINumInputs(PHI);
+ unsigned NumNonRegionInputs = NumInputs;
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ NumNonRegionInputs--;
+ }
+ }
+
+ if (NumNonRegionInputs == 0) {
+ auto DestReg = getPHIDestReg(PHI);
+ replaceRegisterWith(DestReg, CombinedSourceReg);
+ DEBUG(dbgs() << " register " << PrintReg(CombinedSourceReg, TRI) << "\n");
+ PHI.eraseFromParent();
+ } else {
+ DEBUG(dbgs() << PrintReg(getPHIDestReg(PHI), TRI) << "<def> = PHI(");
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(IfMBB);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << IfMBB->getNumber());
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ PHI.eraseFromParent();
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs(
+ MachineInstr &PHI, SmallVector<unsigned, 2> &PHIRegionIndices,
+ unsigned CombinedSourceReg, LinearizedRegion *LRegion) {
+ bool WasLiveOut = false;
+ for (auto PII : PHIRegionIndices) {
+ unsigned Reg = getPHISourceReg(PHI, PII);
+ if (LRegion->isLiveOut(Reg)) {
+ bool IsDead = true;
+
+ // Check if register is live out of the basic block
+ MachineBasicBlock *DefMBB = getDefInstr(Reg)->getParent();
+ for (auto UI = MRI->use_begin(Reg), E = MRI->use_end(); UI != E; ++UI) {
+ if ((*UI).getParent()->getParent() != DefMBB) {
+ IsDead = false;
+ }
+ }
+
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, TRI) << " is "
+ << (IsDead ? "dead" : "alive") << " after PHI replace\n");
+ if (IsDead) {
+ LRegion->removeLiveOut(Reg);
+ }
+ WasLiveOut = true;
+ }
+ }
+
+ if (WasLiveOut)
+ LRegion->addLiveOut(CombinedSourceReg);
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHI(RegionMRT *Region,
+ MachineBasicBlock *LastMerge,
+ MachineInstr &PHI,
+ LinearizedRegion *LRegion) {
+ SmallVector<unsigned, 2> PHIRegionIndices;
+ getPHIRegionIndices(Region, PHI, PHIRegionIndices);
+ unsigned LinearizedSourceReg =
+ storePHILinearizationInfo(PHI, &PHIRegionIndices);
+
+ replacePHI(PHI, LinearizedSourceReg, LastMerge, PHIRegionIndices);
+ replaceLiveOutRegs(PHI, PHIRegionIndices, LinearizedSourceReg, LRegion);
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHI(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB,
+ MachineInstr &PHI) {
+ SmallVector<unsigned, 2> PHINonRegionIndices;
+ getPHINonRegionIndices(Region, PHI, PHINonRegionIndices);
+ unsigned LinearizedSourceReg =
+ storePHILinearizationInfo(PHI, &PHINonRegionIndices);
+ replaceEntryPHI(PHI, LinearizedSourceReg, IfMBB, PHINonRegionIndices);
+}
+
+static void collectPHIs(MachineBasicBlock *MBB,
+ SmallVector<MachineInstr *, 2> &PHIs) {
+ for (auto &BBI : *MBB) {
+ if (BBI.isPHI()) {
+ PHIs.push_back(&BBI);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHIs(RegionMRT *Region,
+ MachineBasicBlock *LastMerge,
+ LinearizedRegion *LRegion) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ auto Exit = Region->getSucc();
+ if (Exit == nullptr)
+ return;
+
+ collectPHIs(Exit, PHIs);
+
+ for (auto PHII : PHIs) {
+ rewriteRegionExitPHI(Region, LastMerge, *PHII, LRegion);
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ auto Entry = Region->getEntry();
+
+ collectPHIs(Entry, PHIs);
+
+ for (auto PHII : PHIs) {
+ rewriteRegionEntryPHI(Region, IfMBB, *PHII);
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock *Dest,
+ const DebugLoc &DL) {
+ DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber()
+ << " -> " << Dest->getNumber() << "\n");
+ MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator();
+ bool HasTerminator = Terminator != MBB->instr_end();
+ if (HasTerminator) {
+ TII->ReplaceTailWithBranchTo(Terminator, Dest);
+ }
+ if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(Dest)) {
+ TII->insertUnconditionalBranch(*MBB, Dest, DL);
+ }
+}
+
+static MachineBasicBlock *getSingleExitNode(MachineFunction &MF) {
+ MachineBasicBlock *result = nullptr;
+ for (auto &MFI : MF) {
+ if (MFI.succ_size() == 0) {
+ if (result == nullptr) {
+ result = &MFI;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ return result;
+}
+
+static bool hasOneExitNode(MachineFunction &MF) {
+ return getSingleExitNode(MF) != nullptr;
+}
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) {
+ auto Exit = Region->getSucc();
+
+ // If the exit is the end of the function, we just use the existing
+ // exit block.
+ MachineFunction *MF = Region->getEntry()->getParent();
+ if (Exit == nullptr && hasOneExitNode(*MF)) {
+ return &(*(--(Region->getEntry()->getParent()->end())));
+ }
+
+ MachineBasicBlock *LastMerge = MF->CreateMachineBasicBlock();
+ if (Exit == nullptr) {
+ MachineFunction::iterator ExitIter = MF->end();
+ MF->insert(ExitIter, LastMerge);
+ } else {
+ MachineFunction::iterator ExitIter = Exit->getIterator();
+ MF->insert(ExitIter, LastMerge);
+ LastMerge->addSuccessor(Exit);
+ insertUnconditionalBranch(LastMerge, Exit);
+ DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n");
+ }
+ return LastMerge;
+}
+
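+// Insert a PHI into MergeBB that merges the value coming from the if
+// block (IfSourceRegister) with the value computed on the code path
+// (CodeSourceRegister). No PHI is needed if MergeBB is the function
+// exit block.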
+void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned DestRegister,
+ unsigned IfSourceRegister,
+ unsigned CodeSourceRegister,
+ bool IsUndefIfSource) {
+ // If this is the function exit block, we don't need a phi.
+ if (MergeBB->succ_begin() == MergeBB->succ_end()) {
+ return;
+ }
+ DEBUG(dbgs() << "Merge PHI (BB#" << MergeBB->getNumber()
+ << "): " << PrintReg(DestRegister, TRI) << "<def> = PHI("
+ << PrintReg(IfSourceRegister, TRI) << ", BB#"
+ << IfBB->getNumber() << ", " << PrintReg(CodeSourceRegister, TRI)
+ << ", BB#" << CodeBB->getNumber() << ")\n");
+ const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin());
+ MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), DestRegister);
+ if (IsUndefIfSource && false) {
+ MIB.addReg(IfSourceRegister, RegState::Undef);
+ } else {
+ MIB.addReg(IfSourceRegister);
+ }
+ MIB.addMBB(IfBB);
+ MIB.addReg(CodeSourceRegister);
+ MIB.addMBB(CodeBB);
+}
+
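+// Remove all successor edges of MBB except a possible self-loop.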
+static void removeExternalCFGSuccessors(MachineBasicBlock *MBB) {
+ // Copy the successor list first, since removeSuccessor invalidates
+ // the successor iterators.
+ SmallVector<MachineBasicBlock *, 4> Succs(MBB->succ_begin(),
+ MBB->succ_end());
+ for (MachineBasicBlock *Succ : Succs) {
+ if (Succ != MBB) {
+ MBB->removeSuccessor(Succ);
+ }
+ }
+}
+
+static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
+ MachineBasicBlock *EndMBB) {
+ // We have to check against the StartMBB successor because a
+ // structurized region with a loop will have the entry block split,
+ // and the backedge will go to the entry successor.
+ DenseSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>> Succs;
+ unsigned SuccSize = StartMBB->succ_size();
+ if (SuccSize > 0) {
+ MachineBasicBlock *StartMBBSucc = *(StartMBB->succ_begin());
+ for (MachineBasicBlock::succ_iterator PI = EndMBB->succ_begin(),
+ E = EndMBB->succ_end();
+ PI != E; ++PI) {
+ // Either we have a back-edge to the entry block, or a back-edge to the
+ // successor of the entry block since the block may be split.
+ if ((*PI) != StartMBB &&
+ !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) {
+ Succs.insert(
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>(EndMBB, *PI));
+ }
+ }
+ }
+
+ for (MachineBasicBlock::pred_iterator PI = StartMBB->pred_begin(),
+ E = StartMBB->pred_end();
+ PI != E; ++PI) {
+ if ((*PI) != EndMBB) {
+ Succs.insert(
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>(*PI, StartMBB));
+ }
+ }
+
+ for (auto SI : Succs) {
+ std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI;
+ DEBUG(dbgs() << "Removing edge: BB#" << Edge.first->getNumber() << " -> BB#"
+ << Edge.second->getNumber() << "\n");
+ Edge.first->removeSuccessor(Edge.second);
+ }
+}
+
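+// Create the if block guarding the linearized code between
+// CodeBBStart and CodeBBEnd. The new block compares the block-select
+// register IfReg against the number of SelectBB and branches either
+// into the code region or directly to MergeBB.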
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
+ MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBBStart,
+ MachineBasicBlock *CodeBBEnd, MachineBasicBlock *SelectBB, unsigned IfReg,
+ bool InheritPreds) {
+ MachineFunction *MF = MergeBB->getParent();
+ MachineBasicBlock *IfBB = MF->CreateMachineBasicBlock();
+
+ if (InheritPreds) {
+ for (MachineBasicBlock::pred_iterator PI = CodeBBStart->pred_begin(),
+ E = CodeBBStart->pred_end();
+ PI != E; ++PI) {
+ if ((*PI) != CodeBBEnd) {
+ MachineBasicBlock *Pred = (*PI);
+ Pred->addSuccessor(IfBB);
+ }
+ }
+ }
+
+ removeExternalCFGEdges(CodeBBStart, CodeBBEnd);
+
+ auto CodeBBStartI = CodeBBStart->getIterator();
+ auto CodeBBEndI = CodeBBEnd->getIterator();
+ auto MergeIter = MergeBB->getIterator();
+ MF->insert(MergeIter, IfBB);
+ MF->splice(MergeIter, CodeBBStartI, ++CodeBBEndI);
+ IfBB->addSuccessor(MergeBB);
+ IfBB->addSuccessor(CodeBBStart);
+
+ DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
+ // Ensure that MergeBB is a successor of CodeBBEnd.
+ if (!CodeBBEnd->isSuccessor(MergeBB))
+ CodeBBEnd->addSuccessor(MergeBB);
+
+ DEBUG(dbgs() << "Moved MBB#" << CodeBBStart->getNumber() << " through MBB#"
+ << CodeBBEnd->getNumber() << "\n");
+
+ // If we have a single predecessor we can find a reasonable debug location
+ MachineBasicBlock *SinglePred =
+ CodeBBStart->pred_size() == 1 ? *(CodeBBStart->pred_begin()) : nullptr;
+ const DebugLoc &DL = SinglePred
+ ? SinglePred->findDebugLoc(SinglePred->getFirstTerminator())
+ : DebugLoc();
+
+ unsigned Reg =
+ TII->insertEQ(IfBB, IfBB->begin(), DL, IfReg,
+ SelectBB->getNumber() /* CodeBBStart->getNumber() */);
+ if (&(*(IfBB->getParent()->begin())) == IfBB) {
+ TII->materializeImmediate(*IfBB, IfBB->begin(), DL, IfReg,
+ CodeBBStart->getNumber());
+ }
+ MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ TII->insertBranch(*IfBB, MergeBB, CodeBBStart, Cond, DL);
+
+ return IfBB;
+}
+
+void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled(
+ SmallVector<MachineOperand, 1> Cond) {
+ if (Cond.size() != 1)
+ return;
+ if (!Cond[0].isReg())
+ return;
+
+ unsigned CondReg = Cond[0].getReg();
+ for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) {
+ (*UI).setIsKill(false);
+ }
+}
+
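+// Replace the terminator of CodeBB with an unconditional branch to
+// MergeBB, after materializing the number of the taken successor
+// into BBSelectReg (using a vector select when the original branch
+// was conditional).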
+void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned BBSelectReg) {
+ MachineBasicBlock *TrueBB = nullptr;
+ MachineBasicBlock *FalseBB = nullptr;
+ SmallVector<MachineOperand, 1> Cond;
+ MachineBasicBlock *FallthroughBB = FallthroughMap[CodeBB];
+ TII->analyzeBranch(*CodeBB, TrueBB, FalseBB, Cond);
+
+ const DebugLoc &DL = CodeBB->findDebugLoc(CodeBB->getFirstTerminator());
+
+ if (FalseBB == nullptr && TrueBB == nullptr && FallthroughBB == nullptr) {
+ // This is an exit block, hence no successors. We will assign the
+ // number of the entry block to the bb select register.
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg,
+ CodeBB->getParent()->begin()->getNumber());
+ insertUnconditionalBranch(CodeBB, MergeBB, DL);
+ return;
+ }
+
+ if (FalseBB == nullptr && TrueBB == nullptr) {
+ TrueBB = FallthroughBB;
+ } else if (TrueBB != nullptr) {
+ FalseBB =
+ (FallthroughBB && (FallthroughBB != TrueBB)) ? FallthroughBB : FalseBB;
+ }
+
+ if ((TrueBB != nullptr && FalseBB == nullptr) || (TrueBB == FalseBB)) {
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg, TrueBB->getNumber());
+ } else {
+ const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg);
+ unsigned TrueBBReg = MRI->createVirtualRegister(RegClass);
+ unsigned FalseBBReg = MRI->createVirtualRegister(RegClass);
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ TrueBBReg, TrueBB->getNumber());
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ FalseBBReg, FalseBB->getNumber());
+ ensureCondIsNotKilled(Cond);
+ TII->insertVectorSelect(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg, Cond, TrueBBReg, FalseBBReg);
+ }
+
+ insertUnconditionalBranch(CodeBB, MergeBB, DL);
+}
+
+MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) {
+ if (MRI->def_begin(Reg) == MRI->def_end()) {
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
+ } else if (!MRI->hasOneDef(Reg)) {
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
+ DEBUG(dbgs() << "DEFS BEGIN:\n");
+ for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) {
+ DEBUG(DI->getParent()->dump());
+ }
+ DEBUG(dbgs() << "DEFS END\n");
+ }
+
+ assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
+ return (*(MRI->def_begin(Reg))).getParent();
+}
+
+void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ unsigned DestReg,
+ unsigned SourceReg) {
+ // In this function we know we are part of a chain already, so we need
+ // to add the registers to the existing chain, and rename the register
+ // inside the region.
+ bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit();
+ MachineInstr *DefInstr = getDefInstr(SourceReg);
+ if (DefInstr->isPHI() && DefInstr->getParent() == CodeBB && IsSingleBB) {
+ // Handle the case where the def is a PHI-def inside a basic
+ // block, then we only need to do renaming. Special care needs to
+ // be taken if the PHI-def is part of an existing chain, or if a
+ // new one needs to be created.
+ InnerRegion->replaceRegisterInsideRegion(SourceReg, DestReg, true, MRI);
+
+ // We collect all PHI Information, and if we are at the region entry,
+ // all PHIs will be removed, and then re-introduced if needed.
+ storePHILinearizationInfoDest(DestReg, *DefInstr);
+ // We have picked up all the information we need now and can remove
+ // the PHI
+ PHIInfo.removeSource(DestReg, SourceReg, CodeBB);
+ DefInstr->eraseFromParent();
+ } else {
+ // If this is not a phi-def, or it is a phi-def but from a linearized region
+ if (IsSingleBB && DefInstr->getParent() == InnerRegion->getEntry()) {
+ // If this is a single BB and the definition is in this block we
+ // need to replace any uses outside the region.
+ InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI);
+ }
+ const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
+ unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
+ bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
+ DEBUG(dbgs() << "Insert Chained PHI\n");
+ insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
+ SourceReg, IsLastDef);
+
+ PHIInfo.removeSource(DestReg, SourceReg, CodeBB);
+ if (IsLastDef) {
+ const DebugLoc &DL = IfBB->findDebugLoc(IfBB->getFirstTerminator());
+ TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DL,
+ NextDestReg, 0);
+ PHIInfo.deleteDef(DestReg);
+ } else {
+ PHIInfo.replaceDef(DestReg, NextDestReg);
+ }
+ }
+}
+
+bool AMDGPUMachineCFGStructurizer::containsDef(MachineBasicBlock *MBB,
+ LinearizedRegion *InnerRegion,
+ unsigned Register) {
+ return getDefInstr(Register)->getParent() == MBB ||
+ InnerRegion->contains(getDefInstr(Register)->getParent());
+}
+
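+// For each register that is live out of the inner region and defined
+// inside it, insert a merge PHI in MergeBB and rewrite the uses
+// outside the region. Chained definitions recorded in PHIInfo are
+// handled through insertChainedPHI.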
+void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ LinearizedRegion *LRegion) {
+ DenseSet<unsigned> *LiveOuts = InnerRegion->getLiveOuts();
+ SmallVector<unsigned, 4> OldLiveOuts;
+ bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit();
+ for (auto OLI : *LiveOuts) {
+ OldLiveOuts.push_back(OLI);
+ }
+
+ for (auto LI : OldLiveOuts) {
+ DEBUG(dbgs() << "LiveOut: " << PrintReg(LI, TRI));
+ if (!containsDef(CodeBB, InnerRegion, LI) ||
+ (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
+ // If the register simply lives through the CodeBB, we don't have
+ // to rewrite anything since the register is not defined in this
+ // part of the code.
+ DEBUG(dbgs() << "- through");
+ continue;
+ }
+ DEBUG(dbgs() << "\n");
+ unsigned Reg = LI;
+ if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) {
+ // If the register is live out, we do want to create a phi,
+ // unless it is from the Exit block, because in that case there
+ // is already a PHI, and no need to create a new one.
+
+ // If the register is just a live out def and not part of a phi
+ // chain, we need to create a PHI node to handle the if region,
+ // and replace all uses outside of the region with the new dest
+ // register, unless it is the outgoing BB select register. We have
+ // already created phi nodes for these.
+ const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+ unsigned PHIDestReg = MRI->createVirtualRegister(RegClass);
+ unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
+ // Create initializer, this value is never used, but is needed
+ // to satisfy SSA.
+ DEBUG(dbgs() << "Initializer for reg: " << PrintReg(Reg) << "\n");
+ TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(),
+ IfSourceReg, 0);
+
+ InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI);
+ DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
+ insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg,
+ IfSourceReg, Reg, true);
+ }
+ }
+
+ // Handle the chained definitions in PHIInfo, checking if this basic block
+ // is a source block for a definition.
+ SmallVector<unsigned, 4> Sources;
+ if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) {
+ DEBUG(dbgs() << "Inserting PHI Live Out from BB#" << CodeBB->getNumber()
+ << "\n");
+ for (auto SI : Sources) {
+ unsigned DestReg;
+ PHIInfo.findDest(SI, CodeBB, DestReg);
+ insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI);
+ }
+ DEBUG(dbgs() << "Insertion done.\n");
+ }
+
+ DEBUG(PHIInfo.dump(MRI));
+}
+
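+// Drop PHI sources that are made redundant by a source defined in
+// MBB itself: since MBB is always executed when the region is
+// executed, all other sources of the same destination are useless.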
+void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Before PHI Prune\n");
+ DEBUG(PHIInfo.dump(MRI));
+ SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4>
+ EliminatedSources;
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+ unsigned DestReg = *DRI;
+ auto SE = PHIInfo.sources_end(DestReg);
+
+ bool MBBContainsPHISource = false;
+ // Check if there is a PHI source in this MBB
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ unsigned SourceReg = (*SRI).first;
+ MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+ if (Def->getParent()->getParent() == MBB) {
+ MBBContainsPHISource = true;
+ }
+ }
+
+ // If so, all other sources are useless since we know this block
+ // is always executed when the region is executed.
+ if (MBBContainsPHISource) {
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ PHILinearize::PHISourceT Source = *SRI;
+ unsigned SourceReg = Source.first;
+ MachineBasicBlock *SourceMBB = Source.second;
+ MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+ if (Def->getParent()->getParent() != MBB) {
+ EliminatedSources.push_back(
+ std::make_tuple(DestReg, SourceReg, SourceMBB));
+ }
+ }
+ }
+ }
+
+ // Remove the PHI sources that were eliminated above, i.e. those
+ // defined outside the given MBB
+ for (auto &SourceInfo : EliminatedSources) {
+ PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo),
+ std::get<2>(SourceInfo));
+ }
+ DEBUG(dbgs() << "After PHI Prune\n");
+ DEBUG(PHIInfo.dump(MRI));
+}
+
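+// Create the entry PHI for DestReg. Sources from outside the region
+// become direct PHI inputs, while sources from inside the region are
+// combined into a single backedge value by chaining PHIs at their
+// defining blocks; that value is added last, incoming from the
+// region exit.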
+void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion,
+ unsigned DestReg) {
+ MachineBasicBlock *Entry = CurrentRegion->getEntry();
+ MachineBasicBlock *Exit = CurrentRegion->getExit();
+
+ DEBUG(dbgs() << "RegionExit: " << Exit->getNumber()
+ << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n");
+
+ int NumSources = 0;
+ auto SE = PHIInfo.sources_end(DestReg);
+
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ NumSources++;
+ }
+
+ if (NumSources == 1) {
+ auto SRI = PHIInfo.sources_begin(DestReg);
+ unsigned SourceReg = (*SRI).first;
+ replaceRegisterWith(DestReg, SourceReg);
+ } else {
+ const DebugLoc &DL = Entry->findDebugLoc(Entry->begin());
+ MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), DestReg);
+ DEBUG(dbgs() << "Entry PHI " << PrintReg(DestReg, TRI) << "<def> = PHI(");
+
+ unsigned CurrentBackedgeReg = 0;
+
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ unsigned SourceReg = (*SRI).first;
+
+ if (CurrentRegion->contains((*SRI).second)) {
+ if (CurrentBackedgeReg == 0) {
+ CurrentBackedgeReg = SourceReg;
+ } else {
+ MachineInstr *PHIDefInstr = getDefInstr(SourceReg);
+ MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent();
+ const TargetRegisterClass *RegClass =
+ MRI->getRegClass(CurrentBackedgeReg);
+ unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass);
+ MachineInstrBuilder BackedgePHI =
+ BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), NewBackedgeReg);
+ BackedgePHI.addReg(CurrentBackedgeReg);
+ BackedgePHI.addMBB(getPHIPred(*PHIDefInstr, 0));
+ BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1));
+ BackedgePHI.addMBB((*SRI).second);
+ DEBUG(dbgs() << "Inserting backedge PHI: "
+ << PrintReg(NewBackedgeReg, TRI) << "<def> = PHI("
+ << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+ << getPHIPred(*PHIDefInstr, 0)->getNumber() << ", "
+ << PrintReg(getPHISourceReg(*PHIDefInstr, 1), TRI)
+ << ", BB#" << (*SRI).second->getNumber());
+ CurrentBackedgeReg = NewBackedgeReg;
+ }
+ } else {
+ MIB.addReg(SourceReg);
+ MIB.addMBB((*SRI).second);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << (*SRI).second->getNumber() << ", ");
+ }
+ }
+
+ // Add the final backedge register source to the entry phi
+ if (CurrentBackedgeReg != 0) {
+ MIB.addReg(CurrentBackedgeReg);
+ MIB.addMBB(Exit);
+ DEBUG(dbgs() << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+ << Exit->getNumber() << ")\n");
+ } else {
+ DEBUG(dbgs() << ")\n");
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) {
+ DEBUG(PHIInfo.dump(MRI));
+
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+ unsigned DestReg = *DRI;
+ createEntryPHI(CurrentRegion, DestReg);
+ }
+ PHIInfo.clear();
+}
+
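+// Rewrite every operand of Register to NewRegister and update the
+// PHI linearization and live-out bookkeeping. Only virtual registers
+// are supported.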
+void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
+ unsigned NewRegister) {
+ assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+ E = MRI->reg_end();
+ I != E;) {
+ MachineOperand &O = *I;
+ ++I;
+ if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ DEBUG(dbgs() << "Trying to substitute physical register: "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ llvm_unreachable("Cannot substitute physical registers");
+ // We don't handle physical registers, but if we needed to
+ // in the future, this is how we would do it:
+ // O.substPhysReg(NewRegister, *TRI);
+ } else {
+ DEBUG(dbgs() << "Replacing register: "
+ << PrintReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ O.setReg(NewRegister);
+ }
+ }
+ PHIInfo.deleteDef(Register);
+
+ getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
+
+ DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) {
+ DEBUG(dbgs() << "Resolve PHI Infos\n");
+ DEBUG(PHIInfo.dump(MRI));
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+ unsigned DestReg = *DRI;
+ DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) << "\n");
+ auto SRI = PHIInfo.sources_begin(DestReg);
+ unsigned SourceReg = (*SRI).first;
+ DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI)
+ << " SourceReg: " << PrintReg(SourceReg, TRI) << "\n");
+
+ assert(std::next(SRI) == PHIInfo.sources_end(DestReg) &&
+ "More than one phi source in entry node");
+ replaceRegisterWith(DestReg, SourceReg);
+ }
+}
+
+static bool isFunctionEntryBlock(MachineBasicBlock *MBB) {
+ return ((&(*(MBB->getParent()->begin()))) == MBB);
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
+ MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBB,
+ LinearizedRegion *CurrentRegion, unsigned BBSelectRegIn,
+ unsigned BBSelectRegOut) {
+ if (isFunctionEntryBlock(CodeBB) && !CurrentRegion->getHasLoop()) {
+ // Handle the non-loop function entry block. We need to allow loops
+ // to the entry block; those are handled as internal blocks below.
+ rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut);
+ resolvePHIInfos(CodeBB);
+ removeExternalCFGSuccessors(CodeBB);
+ CodeBB->addSuccessor(MergeBB);
+ CurrentRegion->addMBB(CodeBB);
+ return nullptr;
+ }
+ if (CurrentRegion->getEntry() == CodeBB && !CurrentRegion->getHasLoop()) {
+ // Handle non-loop region entry block.
+ MachineFunction *MF = MergeBB->getParent();
+ auto MergeIter = MergeBB->getIterator();
+ auto CodeBBStartIter = CodeBB->getIterator();
+ auto CodeBBEndIter = ++(CodeBB->getIterator());
+ if (CodeBBEndIter != MergeIter) {
+ MF->splice(MergeIter, CodeBBStartIter, CodeBBEndIter);
+ }
+ rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut);
+ prunePHIInfo(CodeBB);
+ createEntryPHIs(CurrentRegion);
+ removeExternalCFGSuccessors(CodeBB);
+ CodeBB->addSuccessor(MergeBB);
+ CurrentRegion->addMBB(CodeBB);
+ return nullptr;
+ } else {
+ // Handle internal block.
+ const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn);
+ unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass);
+ rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg);
+ bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB;
+ MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB,
+ BBSelectRegIn, IsRegionEntryBB);
+ CurrentRegion->addMBB(IfBB);
+ // If this is the entry block we need to make the If block the new
+ // linearized region entry.
+ if (IsRegionEntryBB) {
+ CurrentRegion->setEntry(IfBB);
+
+ if (CurrentRegion->getHasLoop()) {
+ MachineBasicBlock *RegionExit = CurrentRegion->getExit();
+ MachineBasicBlock *ETrueBB = nullptr;
+ MachineBasicBlock *EFalseBB = nullptr;
+ SmallVector<MachineOperand, 1> ECond;
+
+ const DebugLoc &DL = DebugLoc();
+ TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond);
+ TII->removeBranch(*RegionExit);
+
+ // We need to create a backedge if there is a loop
+ unsigned Reg = TII->insertNE(
+ RegionExit, RegionExit->instr_end(), DL,
+ CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
+ CurrentRegion->getRegionMRT()->getEntry()->getNumber());
+ MachineOperand RegOp =
+ MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ DEBUG(dbgs() << "RegionExitReg: ");
+ DEBUG(Cond[0].print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+ TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
+ Cond, DebugLoc());
+ RegionExit->addSuccessor(CurrentRegion->getEntry());
+ }
+ }
+ CurrentRegion->addMBB(CodeBB);
+ LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo);
+
+ InnerRegion.setParent(CurrentRegion);
+ DEBUG(dbgs() << "Insert BB Select PHI (BB)\n");
+ insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
+ CodeBBSelectReg);
+ InnerRegion.addMBB(MergeBB);
+
+ DEBUG(InnerRegion.print(dbgs(), TRI));
+ rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion);
+ extractKilledPHIs(CodeBB);
+ if (IsRegionEntryBB) {
+ createEntryPHIs(CurrentRegion);
+ }
+ return IfBB;
+ }
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
+ MachineBasicBlock *MergeBB, LinearizedRegion *InnerRegion,
+ LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB,
+ unsigned BBSelectRegIn, unsigned BBSelectRegOut) {
+ unsigned CodeBBSelectReg =
+ InnerRegion->getRegionMRT()->getInnerOutputRegister();
+ MachineBasicBlock *CodeEntryBB = InnerRegion->getEntry();
+ MachineBasicBlock *CodeExitBB = InnerRegion->getExit();
+ MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeEntryBB, CodeExitBB,
+ SelectBB, BBSelectRegIn, true);
+ CurrentRegion->addMBB(IfBB);
+ bool isEntry = CurrentRegion->getEntry() == InnerRegion->getEntry();
+ if (isEntry) {
+ if (CurrentRegion->getHasLoop()) {
+ MachineBasicBlock *RegionExit = CurrentRegion->getExit();
+ MachineBasicBlock *ETrueBB = nullptr;
+ MachineBasicBlock *EFalseBB = nullptr;
+ SmallVector<MachineOperand, 1> ECond;
+
+ const DebugLoc &DL = DebugLoc();
+ TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond);
+ TII->removeBranch(*RegionExit);
+
+ // We need to create a backedge if there is a loop
+ unsigned Reg =
+ TII->insertNE(RegionExit, RegionExit->instr_end(), DL,
+ CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
+ CurrentRegion->getRegionMRT()->getEntry()->getNumber());
+ MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ DEBUG(dbgs() << "RegionExitReg: ");
+ DEBUG(Cond[0].print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+ TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
+ Cond, DebugLoc());
+ RegionExit->addSuccessor(IfBB);
+ }
+ }
+ CurrentRegion->addMBBs(InnerRegion);
+ DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
+ insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
+ CodeBBSelectReg);
+
+ rewriteLiveOutRegs(IfBB, /* CodeEntryBB */ CodeExitBB, MergeBB, InnerRegion,
+ CurrentRegion);
+
+ rewriteRegionEntryPHIs(InnerRegion, IfBB);
+
+ if (isEntry) {
+ CurrentRegion->setEntry(IfBB);
+ createEntryPHIs(CurrentRegion);
+ }
+
+ return IfBB;
+}
+
+void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
+ MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion) {
+ SmallVector<unsigned, 2> PHIRegionIndices;
+ getPHIRegionIndices(LRegion, PHI, PHIRegionIndices);
+
+ assert(PHIRegionIndices.size() == 1);
+
+ unsigned RegionIndex = PHIRegionIndices[0];
+ unsigned RegionSourceReg = getPHISourceReg(PHI, RegionIndex);
+ MachineBasicBlock *RegionSourceMBB = getPHIPred(PHI, RegionIndex);
+ unsigned PHIDest = getPHIDestReg(PHI);
+ unsigned PHISource = PHIDest;
+ unsigned ReplaceReg;
+
+ if (shrinkPHI(PHI, PHIRegionIndices, &ReplaceReg)) {
+ PHISource = ReplaceReg;
+ }
+
+ const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest);
+ unsigned NewDestReg = MRI->createVirtualRegister(RegClass);
+ LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI);
+ MachineInstrBuilder MIB =
+ BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
+ TII->get(TargetOpcode::PHI), NewDestReg);
+ DEBUG(dbgs() << "Split Entry PHI " << PrintReg(NewDestReg, TRI)
+ << "<def> = PHI(");
+ MIB.addReg(PHISource);
+ MIB.addMBB(Entry);
+ DEBUG(dbgs() << PrintReg(PHISource, TRI) << ", BB#" << Entry->getNumber());
+ MIB.addReg(RegionSourceReg);
+ MIB.addMBB(RegionSourceMBB);
+ DEBUG(dbgs() << " ," << PrintReg(RegionSourceReg, TRI) << ", BB#"
+ << RegionSourceMBB->getNumber() << ")\n");
+}
+
+void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ collectPHIs(Entry, PHIs);
+
+ for (auto PHII : PHIs) {
+ splitLoopPHI(*PHII, Entry, EntrySucc, LRegion);
+ }
+}
+
+// Split the exit block so that we can insert an end control flow instruction
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) {
+ auto MRTRegion = LRegion->getRegionMRT();
+ auto Exit = LRegion->getExit();
+ auto MF = Exit->getParent();
+ auto Succ = MRTRegion->getSucc();
+
+ auto NewExit = MF->CreateMachineBasicBlock();
+ auto AfterExitIter = Exit->getIterator();
+ AfterExitIter++;
+ MF->insert(AfterExitIter, NewExit);
+ Exit->removeSuccessor(Succ);
+ Exit->addSuccessor(NewExit);
+ NewExit->addSuccessor(Succ);
+ insertUnconditionalBranch(NewExit, Succ);
+ LRegion->addMBB(NewExit);
+ LRegion->setExit(NewExit);
+
+ DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n");
+
+ // Replace any PHI Predecessors in the successor with NewExit
+ for (auto &II : *Succ) {
+ MachineInstr &Instr = II;
+
+ // If we are past the PHI instructions we are done
+ if (!Instr.isPHI())
+ break;
+
+ int numPreds = getPHINumInputs(Instr);
+ for (int i = 0; i < numPreds; ++i) {
+ auto Pred = getPHIPred(Instr, i);
+ if (Pred == Exit) {
+ setPhiPred(Instr, i, NewExit);
+ }
+ }
+ }
+
+ return NewExit;
+}
+
+static MachineBasicBlock *split(MachineBasicBlock::iterator I) {
+ // Create the fall-through block.
+ MachineBasicBlock *MBB = (*I).getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock();
+ auto MBBIter = ++(MBB->getIterator());
+ MF->insert(MBBIter, SuccMBB);
+ SuccMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MBB->addSuccessor(SuccMBB);
+
+ // Splice the code over.
+ SuccMBB->splice(SuccMBB->end(), MBB, I, MBB->end());
+
+ return SuccMBB;
+}
+
+// Split the entry block, separating the PHI nodes from the rest of
+// the code. This is needed to insert an initializer for the bb select
+// register in loop regions.
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) {
+ MachineBasicBlock *Entry = LRegion->getEntry();
+ MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI());
+ MachineBasicBlock *Exit = LRegion->getExit();
+
+ DEBUG(dbgs() << "Split BB#" << Entry->getNumber() << " to BB#"
+ << Entry->getNumber() << " -> BB#" << EntrySucc->getNumber()
+ << "\n");
+ LRegion->addMBB(EntrySucc);
+
+ // Make the backedge go to Entry Succ
+ if (Exit->isSuccessor(Entry)) {
+ Exit->removeSuccessor(Entry);
+ }
+ Exit->addSuccessor(EntrySucc);
+ MachineInstr &Branch = *(Exit->instr_rbegin());
+ for (auto &UI : Branch.uses()) {
+ if (UI.isMBB() && UI.getMBB() == Entry) {
+ UI.setMBB(EntrySucc);
+ }
+ }
+
+ splitLoopPHIs(Entry, EntrySucc, LRegion);
+
+ return EntrySucc;
+}
+
+LinearizedRegion *
+AMDGPUMachineCFGStructurizer::initLinearizedRegion(RegionMRT *Region) {
+ LinearizedRegion *LRegion = Region->getLinearizedRegion();
+ LRegion->initLiveOut(Region, MRI, TRI, PHIInfo);
+ LRegion->setEntry(Region->getEntry());
+ return LRegion;
+}
+
+static void removeOldExitPreds(RegionMRT *Region) {
+ MachineBasicBlock *Exit = Region->getSucc();
+ if (Exit == nullptr) {
+ return;
+ }
+ // Copy the predecessor list first, since removing a successor edge
+ // invalidates the predecessor iterators of Exit.
+ SmallVector<MachineBasicBlock *, 4> Preds(Exit->pred_begin(),
+ Exit->pred_end());
+ for (MachineBasicBlock *Pred : Preds) {
+ if (Region->contains(Pred)) {
+ Pred->removeSuccessor(Exit);
+ }
+ }
+}
+
+static bool mbbHasBackEdge(MachineBasicBlock *MBB,
+ SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+ for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
+ if (MBBs.count(*SI) != 0) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool containsNewBackedge(MRT *Tree,
+ SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+ // Need to traverse this in reverse since it is in post order.
+ if (Tree == nullptr)
+ return false;
+
+ if (Tree->isMBB()) {
+ MachineBasicBlock *MBB = Tree->getMBBMRT()->getMBB();
+ MBBs.insert(MBB);
+ if (mbbHasBackEdge(MBB, MBBs)) {
+ return true;
+ }
+ } else {
+ RegionMRT *Region = Tree->getRegionMRT();
+ SetVector<MRT *> *Children = Region->getChildren();
+ for (auto CI = Children->rbegin(), CE = Children->rend(); CI != CE; ++CI) {
+ if (containsNewBackedge(*CI, MBBs))
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool containsNewBackedge(RegionMRT *Region) {
+ SmallPtrSet<MachineBasicBlock *, 8> MBBs;
+ return containsNewBackedge(Region, MBBs);
+}
+
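+// Linearize a non-trivial region: create the common exit block,
+// then walk the children in order and wrap each basic block or
+// already-linearized inner region in an if region guarded by the
+// block-select register. If the region contains a backedge, the
+// entry and exit blocks are split afterwards to form the loop.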
+bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
+ auto *LRegion = initLinearizedRegion(Region);
+ LRegion->setHasLoop(containsNewBackedge(Region));
+ MachineBasicBlock *LastMerge = createLinearizedExitBlock(Region);
+ MachineBasicBlock *CurrentMerge = LastMerge;
+ LRegion->addMBB(LastMerge);
+ LRegion->setExit(LastMerge);
+
+ rewriteRegionExitPHIs(Region, LastMerge, LRegion);
+ removeOldExitPreds(Region);
+
+ DEBUG(PHIInfo.dump(MRI));
+
+ SetVector<MRT *> *Children = Region->getChildren();
+ DEBUG(dbgs() << "===========If Region Start===============\n");
+ if (LRegion->getHasLoop()) {
+ DEBUG(dbgs() << "Has Backedge: Yes\n");
+ } else {
+ DEBUG(dbgs() << "Has Backedge: No\n");
+ }
+
+ unsigned BBSelectRegIn;
+ unsigned BBSelectRegOut;
+ for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) {
+ DEBUG(dbgs() << "CurrentRegion: \n");
+ DEBUG(LRegion->print(dbgs(), TRI));
+
+ auto CNI = CI;
+ ++CNI;
+
+ MRT *Child = (*CI);
+
+ if (Child->isRegion()) {
+
+ LinearizedRegion *InnerLRegion =
+ Child->getRegionMRT()->getLinearizedRegion();
+ // We found that the block is the exit of an inner region; we need
+ // to put it in the current linearized region.
+
+ DEBUG(dbgs() << "Linearizing region: ");
+ DEBUG(InnerLRegion->print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+
+ MachineBasicBlock *InnerEntry = InnerLRegion->getEntry();
+ if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) {
+ // Entry has already been linearized, no need to do this region.
+ unsigned OuterSelect = InnerLRegion->getBBSelectRegOut();
+ unsigned InnerSelectReg =
+ InnerLRegion->getRegionMRT()->getInnerOutputRegister();
+ replaceRegisterWith(InnerSelectReg, OuterSelect);
+ resolvePHIInfos(InnerEntry);
+ if (!InnerLRegion->getExit()->isSuccessor(CurrentMerge))
+ InnerLRegion->getExit()->addSuccessor(CurrentMerge);
+ continue;
+ }
+
+ BBSelectRegOut = Child->getBBSelectRegOut();
+ BBSelectRegIn = Child->getBBSelectRegIn();
+
+ DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+ << "\n");
+ DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+ << "\n");
+
+ MachineBasicBlock *IfEnd = CurrentMerge;
+ CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion,
+ Child->getRegionMRT()->getEntry(),
+ BBSelectRegIn, BBSelectRegOut);
+ TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
+ } else {
+ MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB();
+ DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
+
+ if (MBB == getSingleExitNode(*(MBB->getParent()))) {
+ // If this is the exit block then we need to skip to the next.
+ // The "in" register will be transferred to "out" in the next
+ // iteration.
+ continue;
+ }
+
+ BBSelectRegOut = Child->getBBSelectRegOut();
+ BBSelectRegIn = Child->getBBSelectRegIn();
+
+ DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+ << "\n");
+ DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+ << "\n");
+
+ MachineBasicBlock *IfEnd = CurrentMerge;
+ // This is a basic block that is not part of an inner region, we
+ // need to put it in the current linearized region.
+ CurrentMerge = createIfRegion(CurrentMerge, MBB, LRegion, BBSelectRegIn,
+ BBSelectRegOut);
+ if (CurrentMerge) {
+ TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
+ }
+
+ DEBUG(PHIInfo.dump(MRI));
+ }
+ }
+
+ LRegion->removeFalseRegisterKills(MRI);
+
+ if (LRegion->getHasLoop()) {
+ MachineBasicBlock *NewSucc = splitEntry(LRegion);
+ if (isFunctionEntryBlock(LRegion->getEntry())) {
+ resolvePHIInfos(LRegion->getEntry());
+ }
+ const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI());
+ unsigned InReg = LRegion->getBBSelectRegIn();
+ unsigned InnerSelectReg =
+ MRI->createVirtualRegister(MRI->getRegClass(InReg));
+ unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg));
+ TII->materializeImmediate(*(LRegion->getEntry()),
+ LRegion->getEntry()->getFirstTerminator(), DL,
+ NewInReg, Region->getEntry()->getNumber());
+ // Need to be careful about updating the registers inside the region.
+ LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI);
+ DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n");
+ insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc,
+ InnerSelectReg, NewInReg,
+ LRegion->getRegionMRT()->getInnerOutputRegister());
+ splitExit(LRegion);
+ TII->convertNonUniformLoopRegion(NewSucc, LastMerge);
+ }
+
+ if (Region->isRoot()) {
+ TII->insertReturn(*LastMerge);
+ }
+
+ DEBUG(Region->getEntry()->getParent()->dump());
+ DEBUG(LRegion->print(dbgs(), TRI));
+ DEBUG(PHIInfo.dump(MRI));
+
+ DEBUG(dbgs() << "===========If Region End===============\n");
+
+ Region->setLinearizedRegion(LRegion);
+ return true;
+}
+
+bool AMDGPUMachineCFGStructurizer::structurizeRegion(RegionMRT *Region) {
+ if (false && regionIsSimpleIf(Region)) {
+ transformSimpleIfRegion(Region);
+ return true;
+ } else if (regionIsSequence(Region)) {
+ fixupRegionExits(Region);
+ return false;
+ } else {
+ structurizeComplexRegion(Region);
+ }
+ return false;
+}
+
+static int structurize_once = 0;
+
+bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region,
+ bool isTopRegion) {
+ bool Changed = false;
+
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (CI->isRegion()) {
+ Changed |= structurizeRegions(CI->getRegionMRT(), false);
+ }
+ }
+
+ if (structurize_once < 2 || true) {
+ Changed |= structurizeRegion(Region);
+ structurize_once++;
+ }
+ return Changed;
+}
+
+void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) {
+ DEBUG(dbgs() << "Fallthrough Map:\n");
+ for (auto &MBBI : MF) {
+ MachineBasicBlock *MBB = MBBI.getFallThrough();
+ if (MBB != nullptr) {
+ DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> "
+ << MBB->getNumber() << "\n");
+ }
+ FallthroughMap[&MBBI] = MBB;
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region,
+ unsigned SelectOut) {
+ LinearizedRegion *LRegion = new LinearizedRegion();
+ if (SelectOut) {
+ LRegion->addLiveOut(SelectOut);
+ DEBUG(dbgs() << "Add LiveOut (BBSelect): " << PrintReg(SelectOut, TRI)
+ << "\n");
+ }
+ LRegion->setRegionMRT(Region);
+ Region->setLinearizedRegion(LRegion);
+ LRegion->setParent(Region->getParent()
+ ? Region->getParent()->getLinearizedRegion()
+ : nullptr);
+}
+
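+// Recursively assign the block-select registers in the MRT. Every
+// node receives SelectOut as its output register; the register
+// returned for one child is passed in as the output register of the
+// next child visited.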
+unsigned
+AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned SelectOut,
+ MachineRegisterInfo *MRI,
+ const SIInstrInfo *TII) {
+ if (MRT->isRegion()) {
+ RegionMRT *Region = MRT->getRegionMRT();
+ Region->setBBSelectRegOut(SelectOut);
+ unsigned InnerSelectOut = createBBSelectReg(TII, MRI);
+
+ // FIXME: Move linearized region creation to the original spot.
+ createLinearizedRegion(Region, SelectOut);
+
+ for (auto CI = Region->getChildren()->begin(),
+ CE = Region->getChildren()->end();
+ CI != CE; ++CI) {
+ InnerSelectOut =
+ initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII);
+ }
+ MRT->setBBSelectRegIn(InnerSelectOut);
+ return InnerSelectOut;
+ } else {
+ MRT->setBBSelectRegOut(SelectOut);
+ unsigned NewSelectIn = createBBSelectReg(TII, MRI);
+ MRT->setBBSelectRegIn(NewSelectIn);
+ return NewSelectIn;
+ }
+}
+
+static void checkRegOnlyPHIInputs(MachineFunction &MF) {
+ for (auto &MBBI : MF) {
+ for (MachineBasicBlock::instr_iterator I = MBBI.instr_begin(),
+ E = MBBI.instr_end();
+ I != E; ++I) {
+ MachineInstr &Instr = *I;
+ if (Instr.isPHI()) {
+ int numPreds = getPHINumInputs(Instr);
+ for (int i = 0; i < numPreds; ++i) {
+ assert(Instr.getOperand(i * 2 + 1).isReg() &&
+ "PHI Operand not a register");
+ }
+ }
+ }
+ }
+}
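+
+ // For reference (sketch, not executed code): machine PHIs store operand 0
+ // as the def, followed by (value, predecessor MBB) operand pairs, e.g.:
+ //   for (int i = 0, e = getPHINumInputs(Instr); i < e; ++i) {
+ //     MachineOperand &Val = Instr.getOperand(i * 2 + 1);   // incoming value
+ //     MachineOperand &MBBOp = Instr.getOperand(i * 2 + 2); // incoming block
+ //   }
+ // which is why the assert above checks operand i * 2 + 1.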
+
+INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
+ "AMDGPU Machine CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass)
+INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
+ "AMDGPU Machine CFG Structurizer", false, false)
+
+char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID;
+
+bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ MRI = &(MF.getRegInfo());
+ initFallthroughMap(MF);
+
+ checkRegOnlyPHIInputs(MF);
+ DEBUG(dbgs() << "----STRUCTURIZER START----\n");
+ DEBUG(MF.dump());
+
+ Regions = &(getAnalysis<MachineRegionInfoPass>().getRegionInfo());
+ DEBUG(Regions->dump());
+
+ RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI);
+ setRegionMRT(RTree);
+ initializeSelectRegisters(RTree, 0, MRI, TII);
+ DEBUG(RTree->dump(TRI));
+ bool result = structurizeRegions(RTree, true);
+ delete RTree;
+ DEBUG(dbgs() << "----STRUCTURIZER END----\n");
+ initFallthroughMap(MF);
+ return result;
+}
+
+FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() {
+ return new AMDGPUMachineCFGStructurizer();
+}
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 36dcc699d4ea..e40f39557747 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -397,14 +397,17 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User) {
switch (Inst->getOpcode()) {
- case Instruction::Load:
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(Inst);
+ return !LI->isVolatile();
+ }
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
return true;
case Instruction::Store: {
// Must be the stored pointer operand, not a stored value.
StoreInst *SI = cast<StoreInst>(Inst);
- return SI->getPointerOperand() == User;
+ return (SI->getPointerOperand() == User) && !SI->isVolatile();
}
default:
return false;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 972c28579f7a..6e301b4ad527 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -125,6 +125,9 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasSDWA(false),
HasDPP(false),
FlatAddressSpace(false),
+ FlatInstOffsets(false),
+ FlatGlobalInsts(false),
+ FlatScratchInsts(false),
R600ALUInst(false),
CaymanISA(false),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index a5cda817ac11..bed7d326b3dd 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -145,6 +145,9 @@ protected:
bool HasSDWA;
bool HasDPP;
bool FlatAddressSpace;
+ bool FlatInstOffsets;
+ bool FlatGlobalInsts;
+ bool FlatScratchInsts;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
@@ -380,6 +383,18 @@ public:
return FlatAddressSpace;
}
+ bool hasFlatInstOffsets() const {
+ return FlatInstOffsets;
+ }
+
+ bool hasFlatGlobalInsts() const {
+ return FlatGlobalInsts;
+ }
+
+ bool hasFlatScratchInsts() const {
+ return FlatScratchInsts;
+ }
+
bool isMesaKernel(const MachineFunction &MF) const {
return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
}
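
A hedged usage sketch of the three new predicates (MF is an assumed
MachineFunction; the legality logic is illustrative, not this patch's):

    const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    if (ST.hasFlatInstOffsets()) {
      // Flat memory instructions may carry an immediate offset here.
    }
    if (ST.hasFlatGlobalInsts() && ST.hasFlatScratchInsts()) {
      // Dedicated global_*/scratch_* forms of the flat instructions exist.
    }
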
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index cd5bad04d0b3..386a88b0520f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -118,6 +118,13 @@ static cl::opt<bool> EnableSIInsertWaitcntsPass(
cl::desc("Use new waitcnt insertion pass"),
cl::init(false));
+// Option to run late CFG structurizer
+static cl::opt<bool> LateCFGStructurize(
+ "amdgpu-late-structurize",
+ cl::desc("Enable late CFG structurization"),
+ cl::init(false),
+ cl::Hidden);
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -702,11 +709,15 @@ bool GCNPassConfig::addPreISel() {
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
addPass(&AMDGPUUnifyDivergentExitNodesID);
- addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ if (!LateCFGStructurize) {
+ addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ }
addPass(createSinkingPass());
addPass(createSITypeRewriter());
addPass(createAMDGPUAnnotateUniformValues());
- addPass(createSIAnnotateControlFlowPass());
+ if (!LateCFGStructurize) {
+ addPass(createSIAnnotateControlFlowPass());
+ }
return false;
}
@@ -770,6 +781,9 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
#endif
void GCNPassConfig::addPreRegAlloc() {
+ if (LateCFGStructurize) {
+ addPass(createAMDGPUMachineCFGStructurizerPass());
+ }
addPass(createSIWholeQuadModePass());
}
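
The machine-level structurizer is opt-in. Assuming an llc built from this
tree, the hidden flag added above enables it, with kernel.ll as a
placeholder input:

    llc -mtriple=amdgcn--amdhsa -amdgpu-late-structurize < kernel.ll
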
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c9482c37ec80..beafebc1284a 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -363,13 +363,22 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
- case Instruction::InsertElement:
+ case Instruction::InsertElement: {
+ unsigned EltSize
+ = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
+ if (EltSize < 32) {
+ if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
+ return 0;
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+
// Extracts are just reads of a subregister, so are free. Inserts are
// considered free because we don't want to have any cost for scalarizing
// operations, and we don't have to copy into a different register class.
// Dynamic indexing isn't free and is best avoided.
return Index == ~0u ? 2 : 0;
+ }
default:
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
@@ -479,3 +488,26 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
return false;
}
+
+unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ if (ST->hasVOP3PInsts()) {
+ VectorType *VT = cast<VectorType>(Tp);
+ if (VT->getNumElements() == 2 &&
+ DL.getTypeSizeInBits(VT->getElementType()) == 16) {
+ // With op_sel, VOP3P instructions can freely access the low half or
+ // high half of a register, so any swizzle is free.
+
+ switch (Kind) {
+ case TTI::SK_Broadcast:
+ case TTI::SK_Reverse:
+ case TTI::SK_PermuteSingleSrc:
+ return 0;
+ default:
+ break;
+ }
+ }
+ }
+
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
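
A sketch of what the new hook reports, assuming TTI is a
TargetTransformInfo for a subtarget with VOP3P and Ctx an LLVMContext:

    // A reverse shuffle of <2 x half> should now be reported as free.
    VectorType *V2F16 = VectorType::get(Type::getHalfTy(Ctx), 2);
    int Cost = TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, V2F16);
    // Expected: 0 with VOP3P instructions; the base TTI cost otherwise.
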
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 71d6306bc1a5..e0024e21e82b 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -114,6 +114,9 @@ public:
}
unsigned getVectorSplitCost() { return 0; }
+
+ unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp);
};
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 7c0ef4aeac3c..cafce0164fa9 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -48,6 +48,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUISelDAGToDAG.cpp
AMDGPULowerIntrinsics.cpp
AMDGPUMCInstLower.cpp
+ AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp
AMDGPUUnifyMetadata.cpp
AMDGPUOpenCLImageTypeLoweringPass.cpp
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index b0ac0e689a0b..8ba9efd42c70 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
-def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">;
+def FLATAtomic : ComplexPattern<i64, 2, "SelectFlat">;
//===----------------------------------------------------------------------===//
// FLAT classes
@@ -62,7 +62,9 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
bits<8> vdst;
bits<1> slc;
bits<1> glc;
- bits<1> tfe;
+
+ // We don't use tfe right now, and it was removed in gfx9.
+ bits<1> tfe = 0;
// 15-0 is reserved.
let Inst{16} = !if(ps.has_glc, glc, ps.glcValue);
@@ -79,8 +81,8 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
- (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe),
- " $vdst, $vaddr$glc$slc$tfe"> {
+ (ins VReg_64:$vaddr, GLC:$glc, slc:$slc),
+ " $vdst, $vaddr$glc$slc"> {
let has_data = 0;
let mayLoad = 1;
}
@@ -88,8 +90,8 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo<
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass> : FLAT_Pseudo<
opName,
(outs),
- (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe),
- " $vaddr, $vdata$glc$slc$tfe"> {
+ (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc),
+ " $vaddr, $vdata$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
@@ -105,8 +107,8 @@ multiclass FLAT_Atomic_Pseudo<
def "" : FLAT_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
- " $vaddr, $vdata$slc$tfe",
+ (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc),
+ " $vaddr, $vdata$slc",
[]>,
AtomicNoRet <NAME, 0> {
let mayLoad = 1;
@@ -119,10 +121,10 @@ multiclass FLAT_Atomic_Pseudo<
def _RTN : FLAT_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
- " $vdst, $vaddr, $vdata glc$slc$tfe",
+ (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc),
+ " $vdst, $vaddr, $vdata glc$slc",
[(set vt:$vdst,
- (atomic (FLATAtomic i64:$vaddr, i1:$slc, i1:$tfe), data_vt:$vdata))]>,
+ (atomic (FLATAtomic i64:$vaddr, i1:$slc), data_vt:$vdata))]>,
AtomicNoRet <NAME, 1> {
let mayLoad = 1;
let mayStore = 1;
@@ -311,30 +313,30 @@ def flat_truncstorei16 : flat_st <truncstorei16>;
// Patterns for global loads with no offset.
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr)),
- (inst $addr, 0, 0, 0)
+ (inst $addr, 0, 0)
>;
class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr)),
- (inst $addr, 1, 0, 0)
+ (inst $addr, 1, 0)
>;
class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
(node vt:$data, i64:$addr),
- (inst $addr, $data, 0, 0, 0)
+ (inst $addr, $data, 0, 0)
>;
class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
// Atomic stores follow the atomic binop convention, so the address comes
// first.
(node i64:$addr, vt:$data),
- (inst $addr, $data, 1, 0, 0)
+ (inst $addr, $data, 1, 0)
>;
class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : Pat <
(vt (node i64:$addr, data_vt:$data)),
- (inst $addr, $data, 0, 0)
+ (inst $addr, $data, 0)
>;
let Predicates = [isCIVI] in {
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index bf16a8216001..8066428fe44a 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -27,7 +27,7 @@ void llvm::printLivesAt(SlotIndex SI,
unsigned Num = 0;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
- if (MRI.reg_nodbg_empty(Reg))
+ if (!LIS.hasInterval(Reg))
continue;
const auto &LI = LIS.getInterval(Reg);
if (LI.hasSubRanges()) {
@@ -131,13 +131,13 @@ bool GCNRegPressure::less(const SISubtarget &ST,
const GCNRegPressure& O,
unsigned MaxOccupancy) const {
const auto SGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumSGPRs(getSGRPNum()));
+ ST.getOccupancyWithNumSGPRs(getSGPRNum()));
const auto VGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(getVGRPNum()));
+ ST.getOccupancyWithNumVGPRs(getVGPRNum()));
const auto OtherSGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumSGPRs(O.getSGRPNum()));
+ ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
const auto OtherVGPROcc = std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(O.getVGRPNum()));
+ ST.getOccupancyWithNumVGPRs(O.getVGPRNum()));
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@@ -167,17 +167,17 @@ bool GCNRegPressure::less(const SISubtarget &ST,
return VW < OtherVW;
}
}
- return SGPRImportant ? (getSGRPNum() < O.getSGRPNum()):
- (getVGRPNum() < O.getVGRPNum());
+ return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
+ (getVGPRNum() < O.getVGPRNum());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
- OS << "VGPRs: " << getVGRPNum();
- if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')';
- OS << ", SGPRs: " << getSGRPNum();
- if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')';
+ OS << "VGPRs: " << getVGPRNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
+ OS << ", SGPRs: " << getSGPRNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')';
OS << ", LVGPR WT: " << getVGPRTuplesWeight()
<< ", LSGPR WT: " << getSGPRTuplesWeight();
if (ST) OS << " -> Occ: " << getOccupancy(*ST);
@@ -192,7 +192,6 @@ LaneBitmask llvm::getLiveLaneMask(unsigned Reg,
SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI) {
- assert(!MRI.reg_nodbg_empty(Reg));
LaneBitmask LiveMask;
const auto &LI = LIS.getInterval(Reg);
if (LI.hasSubRanges()) {
@@ -214,7 +213,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
GCNRPTracker::LiveRegSet LiveRegs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
auto Reg = TargetRegisterInfo::index2VirtReg(I);
- if (MRI.reg_nodbg_empty(Reg))
+ if (!LIS.hasInterval(Reg))
continue;
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
if (LiveMask.any())
@@ -223,13 +222,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
return LiveRegs;
}
-void GCNUpwardRPTracker::reset(const MachineInstr &MI) {
- MRI = &MI.getParent()->getParent()->getRegInfo();
- LiveRegs = getLiveRegsAfter(MI, LIS);
- MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
-}
-
-LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
+LaneBitmask GCNRPTracker::getDefRegMask(const MachineOperand &MO) const {
assert(MO.isDef() && MO.isReg() &&
TargetRegisterInfo::isVirtualRegister(MO.getReg()));
@@ -241,7 +234,7 @@ LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
}
-LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
+LaneBitmask GCNRPTracker::getUsedRegMask(const MachineOperand &MO) const {
assert(MO.isUse() && MO.isReg() &&
TargetRegisterInfo::isVirtualRegister(MO.getReg()));
@@ -259,6 +252,18 @@ LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
}
+void GCNUpwardRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy) {
+ MRI = &MI.getParent()->getParent()->getRegInfo();
+ if (LiveRegsCopy) {
+ if (&LiveRegs != LiveRegsCopy)
+ LiveRegs = *LiveRegsCopy;
+ } else {
+ LiveRegs = getLiveRegsAfter(MI, LIS);
+ }
+ MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+}
+
void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
assert(MRI && "call reset first");
@@ -297,6 +302,100 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
MaxPressure = max(MaxPressure, CurPressure);
}
+bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy) {
+ MRI = &MI.getParent()->getParent()->getRegInfo();
+ LastTrackedMI = nullptr;
+ MBBEnd = MI.getParent()->end();
+ NextMI = &MI;
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+ if (NextMI == MBBEnd)
+ return false;
+ if (LiveRegsCopy) {
+ if (&LiveRegs != LiveRegsCopy)
+ LiveRegs = *LiveRegsCopy;
+ } else {
+ LiveRegs = getLiveRegsBefore(*NextMI, LIS);
+ }
+ MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+ return true;
+}
+
+bool GCNDownwardRPTracker::advanceBeforeNext() {
+ assert(MRI && "call reset first");
+
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+ if (NextMI == MBBEnd)
+ return false;
+
+ SlotIndex SI = LIS.getInstructionIndex(*NextMI).getBaseIndex();
+ assert(SI.isValid());
+
+ // Remove dead registers or mask bits.
+ for (auto &It : LiveRegs) {
+ const LiveInterval &LI = LIS.getInterval(It.first);
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges()) {
+ if (!S.liveAt(SI)) {
+ auto PrevMask = It.second;
+ It.second &= ~S.LaneMask;
+ CurPressure.inc(It.first, PrevMask, It.second, *MRI);
+ }
+ }
+ } else if (!LI.liveAt(SI)) {
+ auto PrevMask = It.second;
+ It.second = LaneBitmask::getNone();
+ CurPressure.inc(It.first, PrevMask, It.second, *MRI);
+ }
+ if (It.second.none())
+ LiveRegs.erase(It.first);
+ }
+
+ MaxPressure = max(MaxPressure, CurPressure);
+
+ return true;
+}
+
+void GCNDownwardRPTracker::advanceToNext() {
+ LastTrackedMI = &*NextMI++;
+
+ // Add new registers or mask bits.
+ for (const auto &MO : LastTrackedMI->defs()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ auto &LiveMask = LiveRegs[Reg];
+ auto PrevMask = LiveMask;
+ LiveMask |= getDefRegMask(MO);
+ CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ }
+
+ MaxPressure = max(MaxPressure, CurPressure);
+}
+
+bool GCNDownwardRPTracker::advance() {
+ // If reset has just been called, the live set is already up to date.
+ if ((NextMI == MBBEnd) || (LastTrackedMI && !advanceBeforeNext()))
+ return false;
+ advanceToNext();
+ return true;
+}
+
+bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator End) {
+ while (NextMI != End)
+ if (!advance()) return false;
+ return true;
+}
+
+bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator Begin,
+ MachineBasicBlock::const_iterator End,
+ const LiveRegSet *LiveRegsCopy) {
+ reset(*Begin, LiveRegsCopy);
+ return advance(End);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
@@ -352,4 +451,16 @@ bool GCNUpwardRPTracker::isValid() const {
return true;
}
+void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
+ const MachineRegisterInfo &MRI) {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ auto It = LiveRegs.find(Reg);
+ if (It != LiveRegs.end() && It->second.any())
+ OS << ' ' << PrintVRegOrUnit(Reg, TRI) << ':'
+ << PrintLaneMask(It->second);
+ }
+ OS << '\n';
+}
#endif
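
A minimal usage sketch of the new downward tracker, assuming LIS is the
function's LiveIntervals and MBB a basic block:

    GCNDownwardRPTracker RPT(LIS);
    if (RPT.reset(*MBB->begin())) {  // false if only debug values remain
      RPT.advance(MBB->end());       // walk every instruction downwards
      GCNRegPressure MaxRP = RPT.moveMaxPressure();
      MaxRP.print(dbgs());           // e.g. "VGPRs: 12, SGPRs: 20, ..."
    }
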
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index 82e76a7bfddc..9875ca6a6d16 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -33,19 +33,19 @@ struct GCNRegPressure {
clear();
}
- bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; }
+ bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; }
void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
- unsigned getSGRPNum() const { return Value[SGPR32]; }
- unsigned getVGRPNum() const { return Value[VGPR32]; }
+ unsigned getSGPRNum() const { return Value[SGPR32]; }
+ unsigned getVGPRNum() const { return Value[VGPR32]; }
unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
unsigned getOccupancy(const SISubtarget &ST) const {
- return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()),
- ST.getOccupancyWithNumVGPRs(getVGRPNum()));
+ return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+ ST.getOccupancyWithNumVGPRs(getVGPRNum()));
}
void inc(unsigned Reg,
@@ -92,16 +92,21 @@ public:
typedef DenseMap<unsigned, LaneBitmask> LiveRegSet;
protected:
+ const LiveIntervals &LIS;
LiveRegSet LiveRegs;
GCNRegPressure CurPressure, MaxPressure;
const MachineInstr *LastTrackedMI = nullptr;
mutable const MachineRegisterInfo *MRI = nullptr;
- GCNRPTracker() {}
+ GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+ LaneBitmask getDefRegMask(const MachineOperand &MO) const;
+ LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
public:
// live regs for the current state
const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
+ void clearMaxPressure() { MaxPressure.clear(); }
+
// returns MaxPressure, resetting it
decltype(MaxPressure) moveMaxPressure() {
auto Res = MaxPressure;
@@ -111,17 +116,16 @@ public:
decltype(LiveRegs) moveLiveRegs() {
return std::move(LiveRegs);
}
+ static void printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
+ const MachineRegisterInfo &MRI);
};
class GCNUpwardRPTracker : public GCNRPTracker {
- const LiveIntervals &LIS;
- LaneBitmask getDefRegMask(const MachineOperand &MO) const;
- LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
public:
- GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+ GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
// Reset the tracker to the point just below MI,
// filling the live regs at that point using LIS.
- void reset(const MachineInstr &MI);
+ void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
// move to the state just above the MI
void recede(const MachineInstr &MI);
@@ -131,6 +135,41 @@ public:
bool isValid() const;
};
+class GCNDownwardRPTracker : public GCNRPTracker {
+ // The position of the last reset or advanceBeforeNext call.
+ MachineBasicBlock::const_iterator NextMI;
+
+ MachineBasicBlock::const_iterator MBBEnd;
+
+public:
+ GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
+
+ const MachineBasicBlock::const_iterator getNext() const { return NextMI; }
+
+ // Reset the tracker to the point before MI, filling the live regs at
+ // that point using LIS.
+ // Returns false if the block is empty except for debug values.
+ bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
+
+ // Move to the state right before the next MI. Returns false if the end
+ // of the block has been reached.
+ bool advanceBeforeNext();
+
+ // Move to the state at the MI; advanceBeforeNext has to be called first.
+ void advanceToNext();
+
+ // Move to the state at the next MI. Returns false if the end of the
+ // block has been reached.
+ bool advance();
+
+ // Advance instructions until before End.
+ bool advance(MachineBasicBlock::const_iterator End);
+
+ // Reset to Begin and advance to End.
+ bool advance(MachineBasicBlock::const_iterator Begin,
+ MachineBasicBlock::const_iterator End,
+ const LiveRegSet *LiveRegsCopy = nullptr);
+};
+
LaneBitmask getLiveLaneMask(unsigned Reg,
SlotIndex SI,
const LiveIntervals &LIS,
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 630442625aa3..8ec46665daf5 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -316,46 +316,57 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(),
*MF.getFunction())),
- MinOccupancy(StartingOccupancy), Stage(0) {
+ MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}
void GCNScheduleDAGMILive::schedule() {
+ if (Stage == 0) {
+ // Just record the regions on the first pass.
+ Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
+ return;
+ }
+
std::vector<MachineInstr*> Unsched;
Unsched.reserve(NumRegionInstrs);
for (auto &I : *this)
Unsched.push_back(&I);
- std::pair<unsigned, unsigned> PressureBefore;
+ GCNRegPressure PressureBefore;
if (LIS) {
- DEBUG(dbgs() << "Pressure before scheduling:\n");
- discoverLiveIns();
- PressureBefore = getRealRegPressure();
+ PressureBefore = Pressure[RegionIdx];
+
+ DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
+ GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
+ dbgs() << "Region live-in pressure: ";
+ llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
+ dbgs() << "Region register pressure: ";
+ PressureBefore.print(dbgs()));
}
ScheduleDAGMILive::schedule();
- if (Stage == 0)
- Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
+ Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
if (!LIS)
return;
// Check the results of scheduling.
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
- DEBUG(dbgs() << "Pressure after scheduling:\n");
auto PressureAfter = getRealRegPressure();
- LiveIns.clear();
- if (PressureAfter.first <= S.SGPRCriticalLimit &&
- PressureAfter.second <= S.VGPRCriticalLimit) {
+ DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs()));
+
+ if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
+ PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) {
+ Pressure[RegionIdx] = PressureAfter;
DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
- unsigned WavesAfter = getMaxWaves(PressureAfter.first,
- PressureAfter.second, MF);
- unsigned WavesBefore = getMaxWaves(PressureBefore.first,
- PressureBefore.second, MF);
+ unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(),
+ PressureAfter.getVGPRNum(), MF);
+ unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(),
+ PressureBefore.getVGPRNum(), MF);
DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore <<
", after " << WavesAfter << ".\n");
@@ -368,8 +379,10 @@ void GCNScheduleDAGMILive::schedule() {
<< MinOccupancy << ".\n");
}
- if (WavesAfter >= WavesBefore)
+ if (WavesAfter >= WavesBefore) {
+ Pressure[RegionIdx] = PressureAfter;
return;
+ }
DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RegionEnd = RegionBegin;
@@ -398,166 +411,139 @@ void GCNScheduleDAGMILive::schedule() {
DEBUG(dbgs() << "Scheduling " << *MI);
}
RegionBegin = Unsched.front()->getIterator();
- if (Stage == 0)
- Regions.back() = std::make_pair(RegionBegin, RegionEnd);
+ Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
placeDebugValues();
}
-static inline void setMask(const MachineRegisterInfo &MRI,
- const SIRegisterInfo *SRI, unsigned Reg,
- LaneBitmask &PrevMask, LaneBitmask NewMask,
- unsigned &SGPRs, unsigned &VGPRs) {
- int NewRegs = countPopulation(NewMask.getAsInteger()) -
- countPopulation(PrevMask.getAsInteger());
- if (SRI->isSGPRReg(MRI, Reg))
- SGPRs += NewRegs;
- if (SRI->isVGPR(MRI, Reg))
- VGPRs += NewRegs;
- assert ((int)SGPRs >= 0 && (int)VGPRs >= 0);
- PrevMask = NewMask;
+GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
+ GCNDownwardRPTracker RPTracker(*LIS);
+ RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
+ return RPTracker.moveMaxPressure();
}
-void GCNScheduleDAGMILive::discoverLiveIns() {
- unsigned SGPRs = 0;
- unsigned VGPRs = 0;
+void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
+ GCNDownwardRPTracker RPTracker(*LIS);
+
+ // If the block has only one successor, then the live-ins of that
+ // successor are the live-outs of the current block. We can reuse the
+ // calculated live set if the successor will be scheduled after the
+ // current block.
+ const MachineBasicBlock *OnlySucc = nullptr;
+ if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) {
+ SlotIndexes *Ind = LIS->getSlotIndexes();
+ if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin()))
+ OnlySucc = *MBB->succ_begin();
+ }
- auto &MI = *begin()->getParent()->getFirstNonDebugInstr();
- const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
- SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex();
- assert (SI.isValid());
-
- DEBUG(dbgs() << "Region live-ins:");
- for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
- if (MRI.reg_nodbg_empty(Reg))
- continue;
- const LiveInterval &LI = LIS->getInterval(Reg);
- LaneBitmask LaneMask = LaneBitmask::getNone();
- if (LI.hasSubRanges()) {
- for (const auto &S : LI.subranges())
- if (S.liveAt(SI))
- LaneMask |= S.LaneMask;
- } else if (LI.liveAt(SI)) {
- LaneMask = MRI.getMaxLaneMaskForVReg(Reg);
- }
+ // The scheduler sends regions from the end of the block upwards.
+ size_t CurRegion = RegionIdx;
+ for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
+ if (Regions[CurRegion].first->getParent() != MBB)
+ break;
+ --CurRegion;
+
+ auto I = MBB->begin();
+ auto LiveInIt = MBBLiveIns.find(MBB);
+ if (LiveInIt != MBBLiveIns.end()) {
+ auto LiveIn = std::move(LiveInIt->second);
+ RPTracker.reset(*MBB->begin(), &LiveIn);
+ MBBLiveIns.erase(LiveInIt);
+ } else {
+ I = Regions[CurRegion].first;
+ RPTracker.reset(*I);
+ }
- if (LaneMask.any()) {
- setMask(MRI, SRI, Reg, LiveIns[Reg], LaneMask, SGPRs, VGPRs);
+ for ( ; ; ) {
+ I = RPTracker.getNext();
- DEBUG(dbgs() << ' ' << PrintVRegOrUnit(Reg, SRI) << ':'
- << PrintLaneMask(LiveIns[Reg]));
+ if (Regions[CurRegion].first == I) {
+ LiveIns[CurRegion] = RPTracker.getLiveRegs();
+ RPTracker.clearMaxPressure();
}
- }
- LiveInPressure = std::make_pair(SGPRs, VGPRs);
+ if (Regions[CurRegion].second == I) {
+ Pressure[CurRegion] = RPTracker.moveMaxPressure();
+ if (CurRegion-- == RegionIdx)
+ break;
+ }
+ RPTracker.advanceToNext();
+ RPTracker.advanceBeforeNext();
+ }
- DEBUG(dbgs() << "\nLive-in pressure:\nSGPR = " << SGPRs
- << "\nVGPR = " << VGPRs << '\n');
+ if (OnlySucc) {
+ if (I != MBB->end()) {
+ RPTracker.advanceToNext();
+ RPTracker.advance(MBB->end());
+ }
+ RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs());
+ RPTracker.advanceBeforeNext();
+ MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
+ }
}
-std::pair<unsigned, unsigned>
-GCNScheduleDAGMILive::getRealRegPressure() const {
- unsigned SGPRs, MaxSGPRs, VGPRs, MaxVGPRs;
- SGPRs = MaxSGPRs = LiveInPressure.first;
- VGPRs = MaxVGPRs = LiveInPressure.second;
-
- const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
- DenseMap<unsigned, LaneBitmask> LiveRegs(LiveIns);
+void GCNScheduleDAGMILive::finalizeSchedule() {
+ GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+ DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
- for (const MachineInstr &MI : *this) {
- if (MI.isDebugValue())
- continue;
- SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex();
- assert (SI.isValid());
+ LiveIns.resize(Regions.size());
+ Pressure.resize(Regions.size());
- // Remove dead registers or mask bits.
- for (auto &It : LiveRegs) {
- if (It.second.none())
- continue;
- const LiveInterval &LI = LIS->getInterval(It.first);
- if (LI.hasSubRanges()) {
- for (const auto &S : LI.subranges())
- if (!S.liveAt(SI))
- setMask(MRI, SRI, It.first, It.second, It.second & ~S.LaneMask,
- SGPRs, VGPRs);
- } else if (!LI.liveAt(SI)) {
- setMask(MRI, SRI, It.first, It.second, LaneBitmask::getNone(),
- SGPRs, VGPRs);
- }
- }
+ do {
+ Stage++;
+ RegionIdx = 0;
+ MachineBasicBlock *MBB = nullptr;
- // Add new registers or mask bits.
- for (const auto &MO : MI.defs()) {
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
- continue;
- unsigned SubRegIdx = MO.getSubReg();
- LaneBitmask LaneMask = SubRegIdx != 0
- ? TRI->getSubRegIndexLaneMask(SubRegIdx)
- : MRI.getMaxLaneMaskForVReg(Reg);
- LaneBitmask &LM = LiveRegs[Reg];
- setMask(MRI, SRI, Reg, LM, LM | LaneMask, SGPRs, VGPRs);
- }
- MaxSGPRs = std::max(MaxSGPRs, SGPRs);
- MaxVGPRs = std::max(MaxVGPRs, VGPRs);
- }
+ if (Stage > 1) {
+ // Retry function scheduling if the resulting occupancy is lower than
+ // the occupancy used for first-pass scheduling. This gives more freedom
+ // to schedule low register pressure blocks.
+ // Code is partially copied from MachineSchedulerBase::scheduleRegions().
- DEBUG(dbgs() << "Real region's register pressure:\nSGPR = " << MaxSGPRs
- << "\nVGPR = " << MaxVGPRs << '\n');
+ if (!LIS || StartingOccupancy <= MinOccupancy)
+ break;
- return std::make_pair(MaxSGPRs, MaxVGPRs);
-}
+ DEBUG(dbgs()
+ << "Retrying function scheduling with lowest recorded occupancy "
+ << MinOccupancy << ".\n");
-void GCNScheduleDAGMILive::finalizeSchedule() {
- // Retry function scheduling if we found resulting occupancy and it is
- // lower than used for first pass scheduling. This will give more freedom
- // to schedule low register pressure blocks.
- // Code is partially copied from MachineSchedulerBase::scheduleRegions().
+ S.setTargetOccupancy(MinOccupancy);
+ }
- if (!LIS || StartingOccupancy <= MinOccupancy)
- return;
+ for (auto Region : Regions) {
+ RegionBegin = Region.first;
+ RegionEnd = Region.second;
- DEBUG(dbgs() << "Retrying function scheduling with lowest recorded occupancy "
- << MinOccupancy << ".\n");
+ if (RegionBegin->getParent() != MBB) {
+ if (MBB) finishBlock();
+ MBB = RegionBegin->getParent();
+ startBlock(MBB);
+ if (Stage == 1)
+ computeBlockPressure(MBB);
+ }
- Stage++;
- GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
- S.setTargetOccupancy(MinOccupancy);
+ unsigned NumRegionInstrs = std::distance(begin(), end());
+ enterRegion(MBB, begin(), end(), NumRegionInstrs);
- MachineBasicBlock *MBB = nullptr;
- for (auto Region : Regions) {
- RegionBegin = Region.first;
- RegionEnd = Region.second;
+ // Skip empty scheduling regions (0 or 1 schedulable instructions).
+ if (begin() == end() || begin() == std::prev(end())) {
+ exitRegion();
+ continue;
+ }
- if (RegionBegin->getParent() != MBB) {
- if (MBB) finishBlock();
- MBB = RegionBegin->getParent();
- startBlock(MBB);
- }
+ DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ DEBUG(dbgs() << MF.getName()
+ << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+ << "\n From: " << *begin() << " To: ";
+ if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+ else dbgs() << "End";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
- unsigned NumRegionInstrs = std::distance(begin(), end());
- enterRegion(MBB, begin(), end(), NumRegionInstrs);
+ schedule();
- // Skip empty scheduling regions (0 or 1 schedulable instructions).
- if (begin() == end() || begin() == std::prev(end())) {
exitRegion();
- continue;
+ ++RegionIdx;
}
- DEBUG(dbgs() << "********** MI Scheduling **********\n");
- DEBUG(dbgs() << MF.getName()
- << ":BB#" << MBB->getNumber() << " " << MBB->getName()
- << "\n From: " << *begin() << " To: ";
- if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
- else dbgs() << "End";
- dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+ finishBlock();
- schedule();
-
- exitRegion();
- }
- finishBlock();
- LiveIns.shrink_and_clear();
+ } while (Stage < 2);
}
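
In outline, the staged flow implemented above (a summary, not new
behavior):

    // Stage 0: ScheduleDAGMILive drives schedule() once per region; only
    //          the region boundaries are recorded.
    // Stage 1: finalizeSchedule() revisits each region, computing live-ins
    //          and pressure per block (computeBlockPressure), then schedules.
    // Stage 2: entered only if the achieved occupancy fell below the
    //          starting occupancy; all regions are rescheduled against the
    //          lower occupancy target.
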
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 15af232704ff..3ed3cd5b3b1c 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+#include "GCNRegPressure.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
@@ -74,21 +75,28 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
// Scheduling stage number.
unsigned Stage;
+ // Current region index.
+ size_t RegionIdx;
+
// Vector of regions recorded for later rescheduling
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
- // Region live-ins.
- DenseMap<unsigned, LaneBitmask> LiveIns;
+ // Region live-in cache.
+ SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
+
+ // Region pressure cache.
+ SmallVector<GCNRegPressure, 32> Pressure;
+
+ // Temporary basic block live-in cache.
+ DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
- // Number of live-ins to the current region, first SGPR then VGPR.
- std::pair<unsigned, unsigned> LiveInPressure;
+ // Return current region pressure.
+ GCNRegPressure getRealRegPressure() const;
- // Collect current region live-ins.
- void discoverLiveIns();
+ // Compute and cache live-ins and pressure for all regions in block.
+ void computeBlockPressure(const MachineBasicBlock *MBB);
- // Return current region pressure. First value is SGPR number, second is VGPR.
- std::pair<unsigned, unsigned> getRealRegPressure() const;
public:
GCNScheduleDAGMILive(MachineSchedContext *C,
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index d8cb98fe1b19..8cb35c506135 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -126,7 +126,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
Void = Type::getVoidTy(Context);
Boolean = Type::getInt1Ty(Context);
Int64 = Type::getInt64Ty(Context);
- ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr);
+ ReturnStruct = StructType::get(Boolean, Int64);
BoolTrue = ConstantInt::getTrue(Context);
BoolFalse = ConstantInt::getFalse(Context);
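
The variadic StructType::get no longer takes a null sentinel; a minimal
before/after for the call changed above:

    // Old varargs form: required a trailing sentinel.
    ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr);
    // New type-safe variadic form: no sentinel needed.
    ReturnStruct = StructType::get(Boolean, Int64);
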
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index cc93c27731ff..48a14e4dbea2 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -488,6 +488,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FCANONICALIZE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -2003,6 +2004,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
break;
}
assert(Found);
+ (void)Found;
// This should be before all vector instructions.
BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
@@ -4604,6 +4606,24 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performExtractVectorEltCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SDValue Vec = N->getOperand(0);
+
+ SelectionDAG &DAG = DCI.DAG;
+ if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
+ SDLoc SL(N);
+ EVT EltVT = N->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(0), Idx);
+ return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
+ }
+
+ return SDValue();
+}
+
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@@ -4891,6 +4911,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
break;
}
+ case ISD::EXTRACT_VECTOR_ELT:
+ return performExtractVectorEltCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
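
The effect of the new combine in DAG form (illustrative node names):

    // Before:  t1: v2f16 = fneg v
    //          t2: f16   = extract_vector_elt t1, i
    // After:   t3: f16   = extract_vector_elt v, i
    //          t2: f16   = fneg t3
    // Done only when every use of t2 can absorb a source modifier
    // (allUsesHaveSourceMods), so the standalone fneg folds away.
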
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index d177777ad5ee..046e677756d1 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -100,6 +100,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 92e452a3d6a0..065fd09eb356 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -496,6 +496,188 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
return Opcode;
}
+void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ int64_t Value) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
+ if (RegClass == &AMDGPU::SReg_32RegClass ||
+ RegClass == &AMDGPU::SGPR_32RegClass ||
+ RegClass == &AMDGPU::SReg_32_XM0RegClass ||
+ RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ if (RegClass == &AMDGPU::SReg_64RegClass ||
+ RegClass == &AMDGPU::SGPR_64RegClass ||
+ RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ if (RegClass == &AMDGPU::VGPR_32RegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+ .addImm(Value);
+ return;
+ }
+ if (RegClass == &AMDGPU::VReg_64RegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ unsigned EltSize = 4;
+ unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ if (RI.isSGPRClass(RegClass)) {
+ if (RI.getRegSizeInBits(*RegClass) > 32) {
+ Opcode = AMDGPU::S_MOV_B64;
+ EltSize = 8;
+ } else {
+ Opcode = AMDGPU::S_MOV_B32;
+ EltSize = 4;
+ }
+ }
+
+ ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
+ for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
+ int64_t IdxValue = Idx == 0 ? Value : 0;
+
+ MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
+ get(Opcode), RI.getSubReg(DestReg, Idx));
+ Builder.addImm(IdxValue);
+ }
+}
+
+const TargetRegisterClass *
+SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
+ return &AMDGPU::VGPR_32RegClass;
+}
+
+void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg,
+ unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
+ "Not a VGPR32 reg");
+
+ if (Cond.size() == 1) {
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .add(Cond[0]);
+ } else if (Cond.size() == 2) {
+ assert(Cond[0].isImm() && "Cond[0] is not an immediate");
+ switch (Cond[0].getImm()) {
+ case SIInstrInfo::SCC_TRUE: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::SCC_FALSE: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(0)
+ .addImm(-1);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::VCCNZ: {
+ MachineOperand RegOp = Cond[1];
+ RegOp.setImplicit(false);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .add(RegOp);
+ break;
+ }
+ case SIInstrInfo::VCCZ: {
+ MachineOperand RegOp = Cond[1];
+ RegOp.setImplicit(false);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(TrueReg)
+ .addReg(FalseReg)
+ .add(RegOp);
+ break;
+ }
+ case SIInstrInfo::EXECNZ: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::EXECZ: {
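+ // Note: this path is effectively unsupported; the llvm_unreachable
+ // below fires even though a select sequence is built first.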
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(0)
+ .addImm(-1);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ llvm_unreachable("Unhandled branch predicate EXECZ");
+ break;
+ }
+ default:
+ llvm_unreachable("invalid branch predicate");
+ }
+ } else {
+ llvm_unreachable("Can only handle Cond size 1 or 2");
+ }
+}
+
+unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned SrcReg, int Value) const {
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
+ .addImm(Value)
+ .addReg(SrcReg);
+
+ return Reg;
+}
+
+unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned SrcReg, int Value) const {
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
+ .addImm(Value)
+ .addReg(SrcReg);
+
+ return Reg;
+}
+
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
if (RI.getRegSizeInBits(*DstRC) == 32) {
@@ -834,6 +1016,20 @@ void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
insertWaitStates(MBB, MI, 1);
}
+void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
+ auto MF = MBB.getParent();
+ SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+ assert(Info->isEntryFunction());
+
+ if (MBB.succ_empty()) {
+ bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
+ if (HasNoTerminator)
+ BuildMI(MBB, MBB.end(), DebugLoc(),
+ get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
+ }
+}
+
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default: return 1; // FIXME: Do wait states equal cycles?
@@ -1241,14 +1437,20 @@ bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
return false;
}
- BranchPredicate Pred = getBranchPredicate(I->getOpcode());
- if (Pred == INVALID_BR)
- return true;
+ MachineBasicBlock *CondBB = nullptr;
- MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
- Cond.push_back(MachineOperand::CreateImm(Pred));
- Cond.push_back(I->getOperand(1)); // Save the branch register.
+ if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+ CondBB = I->getOperand(1).getMBB();
+ Cond.push_back(I->getOperand(0));
+ } else {
+ BranchPredicate Pred = getBranchPredicate(I->getOpcode());
+ if (Pred == INVALID_BR)
+ return true;
+ CondBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(Pred));
+ Cond.push_back(I->getOperand(1)); // Save the branch register.
+ }
++I;
if (I == MBB.end()) {
@@ -1351,6 +1553,13 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
return 1;
}
+ if (Cond.size() == 1 && Cond[0].isReg()) {
+ BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
+ .add(Cond[0])
+ .addMBB(TBB);
+ return 1;
+ }
+
assert(TBB && Cond[0].isImm());
unsigned Opcode
@@ -1390,9 +1599,16 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
bool SIInstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
- assert(Cond.size() == 2);
- Cond[0].setImm(-Cond[0].getImm());
- return false;
+ if (Cond.size() != 2) {
+ return true;
+ }
+
+ if (Cond[0].isImm()) {
+ Cond[0].setImm(-Cond[0].getImm());
+ return false;
+ }
+
+ return true;
}
bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
@@ -3920,6 +4136,82 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
return false;
}
+bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
+ return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
+}
+
+void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
+ MachineBasicBlock *IfEnd) const {
+ MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
+ assert(TI != IfEntry->end());
+
+ MachineInstr *Branch = &(*TI);
+ MachineFunction *MF = IfEntry->getParent();
+ MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
+
+ if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+ unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstr *SIIF =
+ BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
+ .add(Branch->getOperand(0))
+ .add(Branch->getOperand(1));
+ MachineInstr *SIEND =
+ BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
+ .addReg(DstReg);
+
+ IfEntry->erase(TI);
+ IfEntry->insert(IfEntry->end(), SIIF);
+ IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
+ }
+}
+
+void SIInstrInfo::convertNonUniformLoopRegion(
+ MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
+ MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
+ // We expect two terminators: one conditional and one unconditional.
+ assert(TI != LoopEnd->end());
+
+ MachineInstr *Branch = &(*TI);
+ MachineFunction *MF = LoopEnd->getParent();
+ MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
+
+ if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+
+ unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstrBuilder HeaderPHIBuilder =
+ BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
+ for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
+ E = LoopEntry->pred_end();
+ PI != E; ++PI) {
+ if (*PI == LoopEnd) {
+ HeaderPHIBuilder.addReg(BackEdgeReg);
+ } else {
+ MachineBasicBlock *PMBB = *PI;
+ unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
+ ZeroReg, 0);
+ HeaderPHIBuilder.addReg(ZeroReg);
+ }
+ HeaderPHIBuilder.addMBB(*PI);
+ }
+ MachineInstr *HeaderPhi = HeaderPHIBuilder;
+ MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
+ get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
+ .addReg(DstReg)
+ .add(Branch->getOperand(0));
+ MachineInstr *SILOOP =
+ BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
+ .addReg(BackEdgeReg)
+ .addMBB(LoopEntry);
+
+ LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
+ LoopEnd->erase(TI);
+ LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
+ LoopEnd->insert(LoopEnd->end(), SILOOP);
+ }
+}
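+
+ // Sketch of the rewrite above (illustrative register names):
+ //   header:  %mask = PHI [ 0, %preheader ], [ %brk, %latch ]
+ //   latch:   %brk  = SI_IF_BREAK %mask, %cond
+ //            SI_LOOP %brk, %header
+ // replacing the SI_NON_UNIFORM_BRCOND_PSEUDO back edge in %latch.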
+
ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
static const std::pair<int, const char *> TargetIndices[] = {
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 03a5ef74b179..f6e5e8883f63 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -143,6 +143,23 @@ public:
RegScavenger *RS, unsigned TmpReg,
unsigned Offset, unsigned Size) const;
+ void materializeImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL,
+ unsigned DestReg,
+ int64_t Value) const;
+
+ const TargetRegisterClass *getPreferredSelectRegClass(
+ unsigned Size) const;
+
+ unsigned insertNE(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned SrcReg, int Value) const;
+
+ unsigned insertEQ(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned SrcReg, int Value) const;
+
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned SrcReg,
bool isKill, int FrameIndex,
@@ -193,7 +210,7 @@ public:
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const override;
+ bool AllowModify = false) const override;
unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const override;
@@ -218,6 +235,11 @@ public:
unsigned DstReg, ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg) const override;
+ void insertVectorSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned DstReg, ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg) const;
+
bool
areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
@@ -705,6 +727,7 @@ public:
void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
+ void insertReturn(MachineBasicBlock &MBB) const;
/// \brief Return the number of wait states that result from executing this
/// instruction.
unsigned getNumWaitStates(const MachineInstr &MI) const;
@@ -750,6 +773,14 @@ public:
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const;
+ bool isNonUniformBranchInstr(MachineInstr &Instr) const;
+
+ void convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
+ MachineBasicBlock *IfEnd) const;
+
+ void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry,
+ MachineBasicBlock *LoopEnd) const;
+
ArrayRef<std::pair<int, const char *>>
getSerializableTargetIndices() const override;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 7ccb54f54e34..3b4bdc864253 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -174,6 +174,13 @@ def SI_MASK_BRANCH : VPseudoInstSI <
let isTerminator = 1 in {
+ def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
+ (outs),
+ (ins SReg_64:$vcc, brtarget:$target),
+ [(brcond i1:$vcc, bb:$target)]> {
+ let Size = 12;
+}
+
def SI_IF: CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
[(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 2281f338ab45..4a11d9471f1d 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -164,8 +164,11 @@ multiclass VOP2eInst <string opName,
class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
- field string Asm32 = "$vdst, $src0, $src1, $imm";
field bit HasExt = 0;
+
+ // Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
+ field string Asm32 = " $vdst, $src0, $src1, $imm";
}
def VOP_MADAK_F16 : VOP_MADAK <f16>;
@@ -174,8 +177,11 @@ def VOP_MADAK_F32 : VOP_MADAK <f32>;
class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
- field string Asm32 = "$vdst, $src0, $imm, $src1";
field bit HasExt = 0;
+
+ // Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
+ field string Asm32 = " $vdst, $src0, $imm, $src1";
}
def VOP_MADMK_F16 : VOP_MADMK <f16>;
@@ -298,7 +304,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
let SubtargetPredicate = isGCN in {
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
-def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>;
+def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">;
let isCommutable = 1 in {
defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
@@ -328,7 +334,7 @@ let Constraints = "$vdst = $src2", DisableEncoding="$src2",
defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
}
-def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>;
+def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">;
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
@@ -383,7 +389,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
let SubtargetPredicate = isVI in {
-def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>;
+def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
@@ -394,7 +400,7 @@ defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
-def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>;
+def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
@@ -651,6 +657,17 @@ multiclass VOP2_Real_e64_vi <bits<10> op> {
VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
+multiclass VOP2_Real_e64only_vi <bits<10> op> {
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Hack to stop printing _e64
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME#"_e64");
+ let OutOperandList = (outs VGPR_32:$vdst);
+ let AsmString = ps.Mnemonic # " " # ps.AsmOperands;
+ }
+}
+
multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> {
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
@@ -718,17 +735,17 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>;
defm V_READLANE_B32 : VOP32_Real_vi <0x289>;
defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>;
-defm V_BFM_B32 : VOP2_Real_e64_vi <0x293>;
-defm V_BCNT_U32_B32 : VOP2_Real_e64_vi <0x28b>;
-defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64_vi <0x28c>;
-defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64_vi <0x28d>;
-defm V_LDEXP_F32 : VOP2_Real_e64_vi <0x288>;
-defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>;
-defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64_vi <0x294>;
-defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64_vi <0x296>;
-defm V_CVT_PK_U16_U32 : VOP2_Real_e64_vi <0x297>;
-defm V_CVT_PK_I16_I32 : VOP2_Real_e64_vi <0x298>;
+defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>;
+defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>;
+defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64only_vi <0x28c>;
+defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64only_vi <0x28d>;
+defm V_LDEXP_F32 : VOP2_Real_e64only_vi <0x288>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64only_vi <0x1f0>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64only_vi <0x294>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64only_vi <0x295>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64only_vi <0x296>;
+defm V_CVT_PK_U16_U32 : VOP2_Real_e64only_vi <0x297>;
+defm V_CVT_PK_I16_I32 : VOP2_Real_e64only_vi <0x298>;
defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>;
defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 217a07488853..ffa6c60d6b1f 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -232,7 +232,6 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
let SubtargetPredicate = isCIVI in {
-def V_MQSAD_U16_U8 : VOP3Inst <"v_mqsad_u16_u8", VOP3_Profile<VOP_I32_I32_I32>>;
def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
@@ -402,7 +401,6 @@ multiclass VOP3be_Real_ci<bits<9> op> {
}
}
-defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>;
defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>;
defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>;
defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>;
@@ -426,7 +424,6 @@ multiclass VOP3be_Real_vi<bits<10> op> {
} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
-defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>;
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>;
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 28c407f74125..dd7fe871345a 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -404,21 +404,11 @@ public:
/// Returns the predicate register associated with the given frame instruction.
unsigned getFramePred(const MachineInstr &MI) const {
assert(isFrameInstr(MI));
- if (isFrameSetup(MI))
- // Operands of ADJCALLSTACKDOWN:
- // - argument declared in ADJCALLSTACKDOWN pattern:
- // 0 - frame size
- // 1 - predicate code (like ARMCC::AL)
- // - added by predOps:
- // 2 - predicate reg
- return MI.getOperand(2).getReg();
- assert(MI.getOpcode() == ARM::ADJCALLSTACKUP ||
- MI.getOpcode() == ARM::tADJCALLSTACKUP);
- // Operands of ADJCALLSTACKUP:
- // - argument declared in ADJCALLSTACKUP pattern:
+ // Operands of ADJCALLSTACKDOWN/ADJCALLSTACKUP:
+ // - argument declared in the pattern:
// 0 - frame size
- // 1 - arg of CALLSEQ_END
- // 2 - predicate code
+ // 1 - arg of CALLSEQ_START/CALLSEQ_END
+ // 2 - predicate code (like ARMCC::AL)
// - added by predOps:
// 3 - predicate reg
return MI.getOperand(3).getReg();
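Given the unified operand layout described in the comment, a minimal sketch of how such a frame pseudo is created and queried (hypothetical surrounding code; the builder calls mirror ones used elsewhere in this patch):

// Operands: 0 = frame size, 1 = arg of CALLSEQ_START, then predOps adds
// 2 = predicate code and 3 = predicate register.
MachineInstrBuilder MIB =
    BuildMI(MBB, InsertPt, DL, TII.get(ARM::ADJCALLSTACKDOWN))
        .addImm(FrameSize)
        .addImm(0)
        .add(predOps(ARMCC::AL));
unsigned PredReg = TII.getFramePred(*MIB); // reads operand 3, as above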
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index 9178c67afa6e..46ac4d0ad933 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -433,7 +433,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// We now know the size of the stack - update the ADJCALLSTACKDOWN
// accordingly.
- CallSeqStart.addImm(ArgHandler.StackSize).add(predOps(ARMCC::AL));
+ CallSeqStart.addImm(ArgHandler.StackSize).addImm(0).add(predOps(ARMCC::AL));
MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP)
.addImm(ArgHandler.StackSize)
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 56cac855620d..4f6a73b5980d 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -1949,7 +1949,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AdjStackDown))
- .addImm(NumBytes));
+ .addImm(NumBytes).addImm(0));
// Process the args.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index e64582402fe1..f8b584db7b99 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -473,9 +473,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
// Use divmod compiler-rt calls for iOS 5.0 and later.
- if (Subtarget->isTargetWatchOS() ||
- (Subtarget->isTargetIOS() &&
- !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
+ if (Subtarget->isTargetMachO() &&
+ !(Subtarget->isTargetIOS() &&
+ Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
}
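Rewriting the guard this way widens the divmod libcalls from {watchOS, iOS 5+} to every Mach-O target except pre-5.0 iOS; the two predicates are related by De Morgan plus the broader isTargetMachO() umbrella. Spelled out as plain logic (a restatement, not new code in the tree):

// Old: WatchOS || (iOS && Version >= 5.0)
// New: MachO && !(iOS && Version < 5.0)
// The new form admits any Mach-O platform (watchOS included) and excludes
// only iOS releases older than 5.0.
bool UseDivmod = Subtarget->isTargetMachO() &&
                 !(Subtarget->isTargetIOS() &&
                   Subtarget->getTargetTriple().isOSVersionLT(5, 0));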
@@ -1817,8 +1817,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!isSibCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue StackPtr =
DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
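This and the analogous call sites in the other targets below suggest SelectionDAG grew a convenience overload that takes the two CALLSEQ_START immediates directly instead of one pre-built target constant. A hedged sketch of the shape the callers rely on (signature inferred from the call sites, not quoted from SelectionDAG.h):

// Inferred helper shape:
//   SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize,
//                            uint64_t OutSize, const SDLoc &DL);
// Old style:
//   Chain = DAG.getCALLSEQ_START(
//       Chain, DAG.getIntPtrConstant(NumBytes, dl, true), dl);
// New style:
Chain = DAG.getCALLSEQ_START(Chain, NumBytes, /*OutSize=*/0, dl);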
@@ -7365,7 +7364,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Pair of floats / doubles used to pass the result.
- Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+ Type *RetTy = StructType::get(ArgTy, ArgTy);
auto &DL = DAG.getDataLayout();
ArgListTy Args;
@@ -13115,7 +13114,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
- Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
+ Type *RetTy = StructType::get(Ty, Ty);
if (Subtarget->isTargetWindows())
InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
@@ -13417,9 +13416,9 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
}
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -13428,7 +13427,7 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
case AtomicOrdering::Acquire:
return nullptr; // Nothing to do
case AtomicOrdering::SequentiallyConsistent:
- if (!IsStore)
+ if (!Inst->hasAtomicStore())
return nullptr; // Nothing to do
/*FALLTHROUGH*/
case AtomicOrdering::Release:
@@ -13442,9 +13441,9 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}
-Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
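Passing the instruction instead of IsStore/IsLoad flags lets a target interrogate the operation itself, as the SequentiallyConsistent case above now does with Inst->hasAtomicStore(). A minimal sketch of an override against the new hook shape (MyTargetLowering is hypothetical):

Instruction *MyTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                Instruction *Inst,
                                                AtomicOrdering Ord) const {
  // Hypothetical policy: only seq_cst operations that store need a
  // leading barrier on this target.
  if (Ord == AtomicOrdering::SequentiallyConsistent &&
      Inst->hasAtomicStore())
    return Builder.CreateFence(Ord);
  return nullptr; // nothing to do
}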
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 08c51b66dfe7..875c06210ae6 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -483,10 +483,10 @@ class InstrItineraryData;
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
- Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
- Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
+ Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index a94d6048f02d..d06b7d0896f1 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -16,7 +16,8 @@
//
// Type profiles.
-def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
def SDT_ARMStructByVal : SDTypeProfile<0, 4,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>,
@@ -1968,8 +1969,8 @@ PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
[(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKDOWN :
-PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
- [(ARMcallseq_start timm:$amt)]>;
+PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2, pred:$p), NoItinerary,
+ [(ARMcallseq_start timm:$amt, timm:$amt2)]>;
}
def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 8048c758e998..bee83dfb6f63 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -284,8 +284,8 @@ def tADJCALLSTACKUP :
Requires<[IsThumb, IsThumb1Only]>;
def tADJCALLSTACKDOWN :
- PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
- [(ARMcallseq_start imm:$amt)]>,
+ PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2), NoItinerary,
+ [(ARMcallseq_start imm:$amt, imm:$amt2)]>,
Requires<[IsThumb, IsThumb1Only]>;
}
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 2ac3fda9f448..8c680cdf9b47 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -101,14 +101,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
assert(RegBank && "Can't get reg bank for virtual register");
const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
- (void)DstSize;
- unsigned SrcReg = I.getOperand(1).getReg();
- const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
- (void)SrcSize;
- // We use copies for trunc, so it's ok for the size of the destination to be
- // smaller (the higher bits will just be undefined).
- assert(DstSize <= SrcSize && "Copy with different width?!");
-
assert((RegBank->getID() == ARM::GPRRegBankID ||
RegBank->getID() == ARM::FPRRegBankID) &&
"Unsupported reg bank");
@@ -135,28 +127,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
return true;
}
-static bool selectFAdd(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII,
- MachineRegisterInfo &MRI) {
- assert(TII.getSubtarget().hasVFP2() && "Can't select fp add without vfp");
-
- LLT Ty = MRI.getType(MIB->getOperand(0).getReg());
- unsigned ValSize = Ty.getSizeInBits();
-
- if (ValSize == 32) {
- if (TII.getSubtarget().useNEONForSinglePrecisionFP())
- return false;
- MIB->setDesc(TII.get(ARM::VADDS));
- } else {
- assert(ValSize == 64 && "Unsupported size for floating point value");
- if (TII.getSubtarget().isFPOnlySP())
- return false;
- MIB->setDesc(TII.get(ARM::VADDD));
- }
- MIB.add(predOps(ARMCC::AL));
-
- return true;
-}
-
static bool selectSequence(MachineInstrBuilder &MIB,
const ARMBaseInstrInfo &TII,
MachineRegisterInfo &MRI,
@@ -352,6 +322,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
}
break;
}
+ case G_ANYEXT:
case G_TRUNC: {
// The high bits are undefined, so there's nothing special to do, just
// treat it as a copy.
@@ -362,12 +333,12 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
if (SrcRegBank.getID() != DstRegBank.getID()) {
- DEBUG(dbgs() << "G_TRUNC operands on different register banks\n");
+ DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n");
return false;
}
if (SrcRegBank.getID() != ARM::GPRRegBankID) {
- DEBUG(dbgs() << "G_TRUNC on non-GPR not supported yet\n");
+ DEBUG(dbgs() << "G_TRUNC/G_ANYEXT on non-GPR not supported yet\n");
return false;
}
@@ -393,10 +364,6 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
}
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
break;
- case G_FADD:
- if (!selectFAdd(MIB, TII, MRI))
- return false;
- break;
case G_FRAME_INDEX:
// Add 0 to the given frame index and hope it will eventually be folded into
// the user(s).
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 9b86030fdd29..5bf6c7aed6b8 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -45,9 +45,11 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({Op, 1, p0}, Legal);
}
- for (unsigned Op : {G_ADD, G_SUB, G_MUL})
- for (auto Ty : {s1, s8, s16, s32})
- setAction({Op, Ty}, Legal);
+ for (unsigned Op : {G_ADD, G_SUB, G_MUL}) {
+ for (auto Ty : {s1, s8, s16})
+ setAction({Op, Ty}, WidenScalar);
+ setAction({Op, s32}, Legal);
+ }
for (unsigned Op : {G_SDIV, G_UDIV}) {
for (auto Ty : {s8, s16})
diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
index 581d5fe159fd..7e4d598a6e0b 100644
--- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
+++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -88,13 +88,15 @@ bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) {
}
}
}
+ bool Changed = false;
// Remove the tagged DMB
for (auto MI : ToRemove) {
MI->eraseFromParent();
++NumDMBsRemoved;
+ Changed = true;
}
- return NumDMBsRemoved > 0;
+ return Changed;
}
/// createARMOptimizeBarriersPass - Returns an instance of the remove double
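The old return value compared NumDMBsRemoved against zero, but that is a STATISTIC counter, which accumulates over every function the pass visits in a compilation; once one DMB had been removed anywhere, the pass would report a change for every subsequent function. Tracking modification in a local flag separates per-run truth from global accounting, a pattern worth a small sketch:

// Per-run change tracking vs. compilation-wide statistics.
bool Changed = false;
for (MachineInstr *MI : ToRemove) {
  MI->eraseFromParent();
  ++NumDMBsRemoved; // statistic: keeps growing across functions
  Changed = true;   // flag: true only if *this* run changed something
}
return Changed;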
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 13a32211f88c..a20997c95cd9 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -225,6 +225,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case G_UDIV:
case G_SEXT:
case G_ZEXT:
+ case G_ANYEXT:
case G_TRUNC:
case G_GEP:
// FIXME: We're abusing the fact that everything lives in a GPR for now; in
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index d09f3ecbaa28..5583d6148b08 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -13,7 +13,9 @@
#include "ARM.h"
#include "ARMCallLowering.h"
#include "ARMLegalizerInfo.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
#include "ARMRegisterBankInfo.h"
+#endif
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp
index c297865db820..0ec8e8b08ceb 100644
--- a/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/lib/Target/AVR/AVRFrameLowering.cpp
@@ -375,7 +375,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
DebugLoc DL = MI->getDebugLoc();
unsigned int Opcode = MI->getOpcode();
- int Amount = MI->getOperand(0).getImm();
+ int Amount = TII.getFrameSize(*MI);
// Adjcallstackup does not need to allocate stack space for the call; instead,
// we insert push instructions that will allocate the necessary stack.
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index f0ab6acedad1..ef9c00e4b784 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -361,7 +361,7 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
- Type *RetTy = (Type *)StructType::get(Ty, Ty, nullptr);
+ Type *RetTy = (Type *)StructType::get(Ty, Ty);
SDLoc dl(Op);
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -1166,8 +1166,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
@@ -1611,8 +1610,9 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator I = MBB->getParent()->begin();
- ++I;
+ MachineFunction::iterator I;
+ for (I = MF->begin(); I != MF->end() && &(*I) != MBB; ++I);
+ if (I != MF->end()) ++I;
MF->insert(I, trueMBB);
MF->insert(I, falseMBB);
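The replaced two-liner always inserted right after the function's entry block; the loop instead locates MBB and inserts the new blocks after it. An equivalent and arguably clearer formulation, sketched with the standard ilist accessor (not what the patch itself uses):

// MachineBasicBlock::getIterator() yields the block's position in its
// parent's block list, so both new blocks land immediately after MBB.
MachineFunction::iterator I = std::next(MBB->getIterator());
MF->insert(I, trueMBB);
MF->insert(I, falseMBB);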
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index 1b6547ef7795..06ad2b3ffdf8 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -17,7 +17,7 @@ include "AVRInstrFormats.td"
// AVR Type Profiles
//===----------------------------------------------------------------------===//
-def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_AVRCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_AVRCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_AVRWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
@@ -333,9 +333,9 @@ let Defs = [SP, SREG],
Uses = [SP] in
{
def ADJCALLSTACKDOWN : Pseudo<(outs),
- (ins i16imm:$amt),
+ (ins i16imm:$amt, i16imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(AVRcallseq_start timm:$amt)]>;
+ [(AVRcallseq_start timm:$amt, timm:$amt2)]>;
// R31R30 is used to update SP, since it is a scratch reg; and because this
// instruction is placed after the function call, R31R30 should always be free.
diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp
index 2813e24d2ac7..11a47bad78ba 100644
--- a/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -52,7 +52,6 @@ AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const AVRTargetMachine &TM = static_cast<const AVRTargetMachine&>(MF.getTarget());
- const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering();
// Reserve the intermediate result registers r1 and r2
// The result of instructions like 'mul' is always stored here.
diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp
index b9b3dff95c0a..6897161c903c 100644
--- a/lib/Target/BPF/BPFISelLowering.cpp
+++ b/lib/Target/BPF/BPFISelLowering.cpp
@@ -257,8 +257,7 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
auto PtrVT = getPointerTy(MF.getDataLayout());
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
SmallVector<std::pair<unsigned, SDValue>, MaxArgs> RegsToPass;
diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td
index 93ee24371c4d..c6c0ff587c6b 100644
--- a/lib/Target/BPF/BPFInstrInfo.td
+++ b/lib/Target/BPF/BPFInstrInfo.td
@@ -16,7 +16,8 @@ include "BPFInstrFormats.td"
// Instruction Operands and Patterns
// These are target-independent nodes, but have target-specific formats.
-def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
+ SDTCisVT<1, iPTR>]>;
def SDT_BPFCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_BPFCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_BPFSetFlag : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>]>;
@@ -445,9 +446,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1,
// ADJCALLSTACKDOWN/UP pseudo insns
let Defs = [R11], Uses = [R11] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
- "#ADJCALLSTACKDOWN $amt",
- [(BPFcallseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(BPFcallseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
"#ADJCALLSTACKUP $amt1 $amt2",
[(BPFcallseq_end timm:$amt1, timm:$amt2)]>;
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 861af94f1e38..1dffebe97f2d 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -848,8 +848,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue Glue;
if (!IsTailCall) {
- SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true);
- Chain = DAG.getCALLSEQ_START(Chain, C, dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
Glue = Chain.getValue(1);
}
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 5a5799dbe009..e4df7ff5c200 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -1209,7 +1209,7 @@ bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
KnownBits Known(T->getBitWidth());
computeKnownBits(V, Known, DL);
- return Known.Zero.countLeadingOnes() >= IterCount;
+ return Known.countMinLeadingZeros() >= IterCount;
}
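KnownBits::countMinLeadingZeros() returns the number of leading bits proved zero, i.e. exactly Known.Zero.countLeadingOnes(), so the rewrite is behavior-preserving but reads as a question about the value rather than about the mask. The query pattern, as a small sketch:

// Prove the top IterCount bits of V are zero before rewriting.
KnownBits Known(T->getBitWidth());
computeKnownBits(V, Known, DL);
bool HighBitsZero = Known.countMinLeadingZeros() >= IterCount;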
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index 32503d111c24..81b5e10c1173 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -714,7 +714,8 @@ def: Pat<(i1 0), (PS_false)>;
def: Pat<(i1 1), (PS_true)>;
// Pseudo instructions.
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
@@ -732,8 +733,8 @@ def HexagonTCRet : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def: Pat<(callseq_start timm:$amt),
- (ADJCALLSTACKDOWN imm:$amt)>;
+def: Pat<(callseq_start timm:$amt, timm:$amt2),
+ (ADJCALLSTACKDOWN imm:$amt, imm:$amt2)>;
def: Pat<(callseq_end timm:$amt1, timm:$amt2),
(ADJCALLSTACKUP imm:$amt1, imm:$amt2)>;
diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
index 8c2caea2d5c5..0f99dfe342b8 100644
--- a/lib/Target/Hexagon/HexagonPseudo.td
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -80,7 +80,7 @@ def PS_false : InstHexagon<(outs PredRegs:$dst), (ins), "",
[(set I1:$dst, 0)], "", C2_andn.Itinerary, TypeCR>;
let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
".error \"should not emit\" ", []>;
let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in
diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp
index d156294a0b0c..0a9cac2565f2 100644
--- a/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "LanaiISelLowering.h"
#include "Lanai.h"
#include "LanaiCondCode.h"
-#include "LanaiISelLowering.h"
#include "LanaiMachineFunctionInfo.h"
#include "LanaiSubtarget.h"
#include "LanaiTargetObjectFile.h"
@@ -38,10 +38,11 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetCallingConv.h"
@@ -649,10 +650,7 @@ SDValue LanaiTargetLowering::LowerCCCCallTo(
ByValArgs.push_back(FIPtr);
}
- Chain = DAG.getCALLSEQ_START(
- Chain,
- DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
@@ -1502,3 +1500,24 @@ SDValue LanaiTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
+
+void LanaiTargetLowering::computeKnownBitsForTargetNode(
+ const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth) const {
+ unsigned BitWidth = Known.getBitWidth();
+ switch (Op.getOpcode()) {
+ default:
+ break;
+ case LanaiISD::SETCC:
+ Known = KnownBits(BitWidth);
+ Known.Zero.setBits(1, BitWidth);
+ break;
+ case LanaiISD::SELECT_CC:
+ KnownBits Known2;
+ DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
+ DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
+ Known.Zero &= Known2.Zero;
+ Known.One &= Known2.One;
+ break;
+ }
+}
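The SETCC case encodes that a Lanai comparison materializes only 0 or 1, so bits [1, BitWidth) are known zero; SELECT_CC conservatively keeps only facts common to both possible results. One payoff shows up later in this same import, where the TableGen pattern folding (and (LanaiSetCC ...), 1) into SCC is deleted: generic known-bits reasoning now removes that redundant mask by itself. The intersection rule for the select case, sketched:

// For select(c, a, b), a bit is known only if it is known, with the same
// value, in both arms.
KnownBits KnownA, KnownB;
DAG.computeKnownBits(A, KnownA, Depth + 1);
DAG.computeKnownBits(B, KnownB, Depth + 1);
KnownA.Zero &= KnownB.Zero; // known zero in both arms
KnownA.One  &= KnownB.One;  // known one in both arms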
diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h
index c2fba4f9d167..49ad52a39771 100644
--- a/lib/Target/Lanai/LanaiISelLowering.h
+++ b/lib/Target/Lanai/LanaiISelLowering.h
@@ -106,6 +106,11 @@ public:
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
private:
SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool IsVarArg,
diff --git a/lib/Target/Lanai/LanaiInstrInfo.td b/lib/Target/Lanai/LanaiInstrInfo.td
index 285fca11737d..776fee101dfe 100644
--- a/lib/Target/Lanai/LanaiInstrInfo.td
+++ b/lib/Target/Lanai/LanaiInstrInfo.td
@@ -22,7 +22,8 @@ include "LanaiInstrFormats.td"
// -------------------------------------------------- //
// These are target-independent nodes, but have target-specific formats.
-def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_LanaiCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
def SDT_LanaiCall : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
@@ -750,9 +751,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber SP.
let Defs = [SP], Uses = [SP] in {
- def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- "#ADJCALLSTACKDOWN $amt",
- [(CallSeqStart timm:$amt)]>;
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(CallSeqStart timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP $amt1 $amt2",
[(CallSeqEnd timm:$amt1, timm:$amt2)]>;
@@ -770,9 +771,6 @@ let Uses = [SR] in {
[(set (i32 GPR:$Rs1), (LanaiSetCC imm:$DDDI))]>;
}
-// SCC's output is already 1-bit so and'ing with 1 is redundant.
-def : Pat<(and (LanaiSetCC imm:$DDDI), 1), (SCC imm:$DDDI)>;
-
// Select with hardware support
let Uses = [SR], isSelect = 1 in {
def SELECT : InstRR<0b111, (outs GPR:$Rd),
diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp
index f1cb0b6c031b..b4ff8f66c55f 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -236,7 +236,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
// adjcallstackdown instruction into 'add SP, <amt>'
// TODO: consider using push / pop instead of sub + store / add
MachineInstr &Old = *I;
- uint64_t Amount = Old.getOperand(0).getImm();
+ uint64_t Amount = TII.getFrameSize(Old);
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
@@ -252,8 +252,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
} else {
assert(Old.getOpcode() == TII.getCallFrameDestroyOpcode());
// factor out the amount the callee already popped.
- uint64_t CalleeAmt = Old.getOperand(1).getImm();
- Amount -= CalleeAmt;
+ Amount -= TII.getFramePoppedByCallee(Old);
if (Amount)
New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::ADD16ri),
MSP430::SP)
@@ -272,7 +271,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
} else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) {
// If we are performing frame pointer elimination and if the callee pops
// something off the stack pointer, add it back.
- if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+ if (uint64_t CalleeAmt = TII.getFramePoppedByCallee(*I)) {
MachineInstr &Old = *I;
MachineInstr *New =
BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP)
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 40b1dd3cc2eb..cc6e64043f54 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -40,21 +40,24 @@ using namespace llvm;
typedef enum {
NoHWMult,
- HWMultIntr,
- HWMultNoIntr
+ HWMult16,
+ HWMult32,
+ HWMultF5
} HWMultUseMode;
static cl::opt<HWMultUseMode>
-HWMultMode("msp430-hwmult-mode", cl::Hidden,
+HWMultMode("mhwmult", cl::Hidden,
cl::desc("Hardware multiplier use mode"),
- cl::init(HWMultNoIntr),
+ cl::init(NoHWMult),
cl::values(
- clEnumValN(NoHWMult, "no",
+ clEnumValN(NoHWMult, "none",
"Do not use hardware multiplier"),
- clEnumValN(HWMultIntr, "interrupts",
- "Assume hardware multiplier can be used inside interrupts"),
- clEnumValN(HWMultNoIntr, "use",
- "Assume hardware multiplier cannot be used inside interrupts")));
+ clEnumValN(HWMult16, "16bit",
+ "Use 16-bit hardware multiplier"),
+ clEnumValN(HWMult32, "32bit",
+ "Use 32-bit hardware multiplier"),
+ clEnumValN(HWMultF5, "f5series",
+ "Use F5 series hardware multiplier")));
MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
const MSP430Subtarget &STI)
@@ -131,29 +134,29 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
// FIXME: Implement efficiently multiplication by a constant
- setOperationAction(ISD::MUL, MVT::i8, Expand);
- setOperationAction(ISD::MULHS, MVT::i8, Expand);
- setOperationAction(ISD::MULHU, MVT::i8, Expand);
- setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
- setOperationAction(ISD::MUL, MVT::i16, Expand);
+ setOperationAction(ISD::MUL, MVT::i8, Promote);
+ setOperationAction(ISD::MULHS, MVT::i8, Promote);
+ setOperationAction(ISD::MULHU, MVT::i8, Promote);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i8, Promote);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i8, Promote);
+ setOperationAction(ISD::MUL, MVT::i16, LibCall);
setOperationAction(ISD::MULHS, MVT::i16, Expand);
setOperationAction(ISD::MULHU, MVT::i16, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
- setOperationAction(ISD::UDIV, MVT::i8, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i8, Expand);
- setOperationAction(ISD::UREM, MVT::i8, Expand);
- setOperationAction(ISD::SDIV, MVT::i8, Expand);
- setOperationAction(ISD::SDIVREM, MVT::i8, Expand);
- setOperationAction(ISD::SREM, MVT::i8, Expand);
- setOperationAction(ISD::UDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UDIV, MVT::i8, Promote);
+ setOperationAction(ISD::UDIVREM, MVT::i8, Promote);
+ setOperationAction(ISD::UREM, MVT::i8, Promote);
+ setOperationAction(ISD::SDIV, MVT::i8, Promote);
+ setOperationAction(ISD::SDIVREM, MVT::i8, Promote);
+ setOperationAction(ISD::SREM, MVT::i8, Promote);
+ setOperationAction(ISD::UDIV, MVT::i16, LibCall);
setOperationAction(ISD::UDIVREM, MVT::i16, Expand);
- setOperationAction(ISD::UREM, MVT::i16, Expand);
- setOperationAction(ISD::SDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UREM, MVT::i16, LibCall);
+ setOperationAction(ISD::SDIV, MVT::i16, LibCall);
setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
- setOperationAction(ISD::SREM, MVT::i16, Expand);
+ setOperationAction(ISD::SREM, MVT::i16, LibCall);
// varargs support
setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -162,15 +165,183 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::JumpTable, MVT::i16, Custom);
- // Libcalls names.
- if (HWMultMode == HWMultIntr) {
- setLibcallName(RTLIB::MUL_I8, "__mulqi3hw");
- setLibcallName(RTLIB::MUL_I16, "__mulhi3hw");
- } else if (HWMultMode == HWMultNoIntr) {
- setLibcallName(RTLIB::MUL_I8, "__mulqi3hw_noint");
- setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
+ // EABI Libcalls - EABI Section 6.2
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const ISD::CondCode Cond;
+ } LibraryCalls[] = {
+ // Floating point conversions - EABI Table 6
+ { RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf", ISD::SETCC_INVALID },
+ { RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOSINT_F64_I16, "__mspabi_fixdi", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOUINT_F64_I16, "__mspabi_fixdu", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOSINT_F32_I16, "__mspabi_fixfi", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOUINT_F32_I16, "__mspabi_fixfu", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::SINTTOFP_I16_F64, "__mspabi_fltid", ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::UINTTOFP_I16_F64, "__mspabi_fltud", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld", ISD::SETCC_INVALID },
+ // The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::SINTTOFP_I16_F32, "__mspabi_fltif", ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::UINTTOFP_I16_F32, "__mspabi_fltuf", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf", ISD::SETCC_INVALID },
+ // The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf", ISD::SETCC_INVALID },
+
+ // Floating point comparisons - EABI Table 7
+ { RTLIB::OEQ_F64, "__mspabi_cmpd", ISD::SETEQ },
+ { RTLIB::UNE_F64, "__mspabi_cmpd", ISD::SETNE },
+ { RTLIB::OGE_F64, "__mspabi_cmpd", ISD::SETGE },
+ { RTLIB::OLT_F64, "__mspabi_cmpd", ISD::SETLT },
+ { RTLIB::OLE_F64, "__mspabi_cmpd", ISD::SETLE },
+ { RTLIB::OGT_F64, "__mspabi_cmpd", ISD::SETGT },
+ { RTLIB::OEQ_F32, "__mspabi_cmpf", ISD::SETEQ },
+ { RTLIB::UNE_F32, "__mspabi_cmpf", ISD::SETNE },
+ { RTLIB::OGE_F32, "__mspabi_cmpf", ISD::SETGE },
+ { RTLIB::OLT_F32, "__mspabi_cmpf", ISD::SETLT },
+ { RTLIB::OLE_F32, "__mspabi_cmpf", ISD::SETLE },
+ { RTLIB::OGT_F32, "__mspabi_cmpf", ISD::SETGT },
+
+ // Floating point arithmetic - EABI Table 8
+ { RTLIB::ADD_F64, "__mspabi_addd", ISD::SETCC_INVALID },
+ { RTLIB::ADD_F32, "__mspabi_addf", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F64, "__mspabi_divd", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F32, "__mspabi_divf", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F64, "__mspabi_mpyd", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F32, "__mspabi_mpyf", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F64, "__mspabi_subd", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F32, "__mspabi_subf", ISD::SETCC_INVALID },
+ // The following are NOT implemented in libgcc
+ // { RTLIB::NEG_F64, "__mspabi_negd", ISD::SETCC_INVALID },
+ // { RTLIB::NEG_F32, "__mspabi_negf", ISD::SETCC_INVALID },
+
+ // TODO: SLL/SRA/SRL are in libgcc, RLL isn't
+
+ // Universal Integer Operations - EABI Table 9
+ { RTLIB::SDIV_I16, "__mspabi_divi", ISD::SETCC_INVALID },
+ { RTLIB::SDIV_I32, "__mspabi_divli", ISD::SETCC_INVALID },
+ { RTLIB::SDIV_I64, "__mspabi_divlli", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I16, "__mspabi_divu", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I32, "__mspabi_divul", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I64, "__mspabi_divull", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I16, "__mspabi_remi", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I32, "__mspabi_remli", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I64, "__mspabi_remlli", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I16, "__mspabi_remu", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I32, "__mspabi_remul", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I64, "__mspabi_remull", ISD::SETCC_INVALID },
+
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ if (LC.Cond != ISD::SETCC_INVALID)
+ setCmpLibcallCC(LC.Op, LC.Cond);
+ }
+
+ if (HWMultMode == HWMult16) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_hw" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_hw" },
+ // TODO The __mspabi_mpysl*_hw functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_hw functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else if (HWMultMode == HWMult32) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_hw32" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_hw32" },
+ // TODO The __mspabi_mpysl*_hw32 functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_hw32 functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else if (HWMultMode == HWMultF5) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_f5hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_f5hw" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_f5hw" },
+ // TODO The __mspabi_mpysl*_f5hw functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_f5hw functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else { // NoHWMult
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll" },
+ // The __mspabi_mpysl* functions are NOT implemented in libgcc
+ // The __mspabi_mpyul* functions are NOT implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::MSP430_BUILTIN);
}
+ // Several of the runtime library functions use a special calling conv
+ setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN);
+ // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll
+
setMinFunctionAlignment(1);
setPrefFunctionAlignment(2);
}
@@ -281,10 +452,27 @@ template<typename ArgT>
static void AnalyzeArguments(CCState &State,
SmallVectorImpl<CCValAssign> &ArgLocs,
const SmallVectorImpl<ArgT> &Args) {
- static const MCPhysReg RegList[] = {
+ static const MCPhysReg CRegList[] = {
MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
};
- static const unsigned NbRegs = array_lengthof(RegList);
+ static const unsigned CNbRegs = array_lengthof(CRegList);
+ static const MCPhysReg BuiltinRegList[] = {
+ MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
+ };
+ static const unsigned BuiltinNbRegs = array_lengthof(BuiltinRegList);
+
+ ArrayRef<MCPhysReg> RegList;
+ unsigned NbRegs;
+
+ bool Builtin = (State.getCallingConv() == CallingConv::MSP430_BUILTIN);
+ if (Builtin) {
+ RegList = BuiltinRegList;
+ NbRegs = BuiltinNbRegs;
+ } else {
+ RegList = CRegList;
+ NbRegs = CNbRegs;
+ }
if (State.isVarArg()) {
AnalyzeVarArgs(State, Args);
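The eight-entry builtin register list, combined with the asserts added below (exactly two arguments, each of four 16-bit parts), implies that MSP430_BUILTIN passes a pair of 64-bit operands entirely in registers, presumably one in R8..R11 and the other in R12..R15. A hedged illustration of the kind of helper this serves, using a name from the libcall table earlier in this diff:

// Hypothetical caller-side view: both i64 operands of the EABI multiply
// helper fit in registers under MSP430_BUILTIN, so no stack traffic is
// needed for the call itself.
extern "C" long long __mspabi_mpyll(long long a, long long b);

long long scale(long long x) {
  return __mspabi_mpyll(x, 10); // lowered as an MSP430_BUILTIN libcall
}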
@@ -294,6 +482,11 @@ static void AnalyzeArguments(CCState &State,
SmallVector<unsigned, 4> ArgsParts;
ParseFunctionArgs(Args, ArgsParts);
+ if (Builtin) {
+ assert(ArgsParts.size() == 2 &&
+ "Builtin calling convention requires two arguments");
+ }
+
unsigned RegsLeft = NbRegs;
bool UsedStack = false;
unsigned ValNo = 0;
@@ -323,6 +516,11 @@ static void AnalyzeArguments(CCState &State,
unsigned Parts = ArgsParts[i];
+ if (Builtin) {
+ assert(Parts == 4 &&
+ "Builtin calling convention requires 64-bit arguments");
+ }
+
if (!UsedStack && Parts == 2 && RegsLeft == 1) {
// Special case for 32-bit register split, see EABI section 3.3.3
unsigned Reg = State.AllocateReg(RegList);
@@ -400,6 +598,7 @@ MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
switch (CallConv) {
default:
llvm_unreachable("Unsupported calling convention");
+ case CallingConv::MSP430_BUILTIN:
case CallingConv::Fast:
case CallingConv::C:
return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
@@ -598,7 +797,6 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
/// LowerCCCCallTo - function arguments are copied from virtual regs to
/// (physical regs)/(stack frame); CALLSEQ_START and CALLSEQ_END are emitted.
-// TODO: sret.
SDValue MSP430TargetLowering::LowerCCCCallTo(
SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -615,8 +813,7 @@ SDValue MSP430TargetLowering::LowerCCCCallTo(
unsigned NumBytes = CCInfo.getNextStackOffset();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index e3259bd6a7bc..d81f17e753c5 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -85,6 +85,12 @@ public:
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
+
+ int64_t getFramePoppedByCallee(const MachineInstr &I) const {
+ assert(isFrameInstr(I) && "Not a frame instruction");
+ assert(I.getOperand(1).getImm() >= 0 && "Size must not be negative");
+ return I.getOperand(1).getImm();
+ }
};
}
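getFramePoppedByCallee pairs with the getFrameSize accessor used by the MSP430FrameLowering changes above; both hide raw operand indices of the call-frame pseudos behind named queries. The consuming code, shaped after eliminateCallFramePseudoInstr in this patch:

// Read the call-frame pseudo through accessors, not operand positions.
uint64_t Amount = TII.getFrameSize(Old);      // was Old.getOperand(0).getImm()
if (Old.getOpcode() == TII.getCallFrameDestroyOpcode())
  Amount -= TII.getFramePoppedByCallee(Old);  // was Old.getOperand(1).getImm()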
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 22fc2474fae6..1cd18611e52c 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -23,7 +23,8 @@ class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
// Type Profiles.
//===----------------------------------------------------------------------===//
def SDT_MSP430Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
-def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>,
+ SDTCisVT<1, i16>]>;
def SDT_MSP430CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
@@ -113,9 +114,9 @@ def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber SR.
let Defs = [SP, SR], Uses = [SP] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(MSP430callseq_start timm:$amt)]>;
+ [(MSP430callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
"#ADJCALLSTACKUP",
[(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
@@ -209,7 +210,7 @@ let isCall = 1 in
// a use to prevent stack-pointer assignments that appear immediately
// before calls from potentially appearing dead. Uses for argument
// registers are added manually.
- let Defs = [R12, R13, R14, R15, SR],
+ let Defs = [R11, R12, R13, R14, R15, SR],
Uses = [SP] in {
def CALLi : II16i<0x0,
(outs), (ins i16imm:$dst),
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 81cd9d1ad3f8..9600bc28f100 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -41,12 +41,12 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const Function* F = MF->getFunction();
static const MCPhysReg CalleeSavedRegs[] = {
MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7,
- MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R8, MSP430::R9, MSP430::R10,
0
};
static const MCPhysReg CalleeSavedRegsFP[] = {
MSP430::R5, MSP430::R6, MSP430::R7,
- MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R8, MSP430::R9, MSP430::R10,
0
};
static const MCPhysReg CalleeSavedRegsIntr[] = {
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index 21c99da0922d..b83f44a74d5b 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -1133,7 +1133,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
if (NumBytes < 16)
NumBytes = 16;
- emitInst(Mips::ADJCALLSTACKDOWN).addImm(16);
+ emitInst(Mips::ADJCALLSTACKDOWN).addImm(16).addImm(0);
// Process the args.
MVT firstMVT;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 8f39ebd42a5c..78bae6954c3c 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -2787,7 +2787,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true);
if (!IsTailCall)
- Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NextStackOffset, 0, DL);
SDValue StackPtr =
DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP,
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index b90077d7807d..8761946b8dbb 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -21,7 +21,7 @@ def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
SDTCisSameAs<1, 2>,
SDTCisSameAs<3, 4>,
SDTCisInt<4>]>;
-def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>;
def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
@@ -1719,8 +1719,8 @@ let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in {
}
let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
-def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt),
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index 68dcbdfb4211..f8d9c34556bc 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -257,7 +257,7 @@ bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg,
// Get the instruction that loads the function address from the GOT.
Reg = MO->getReg();
- Val = (Value*)nullptr;
+ Val = nullptr;
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
MachineInstr *DefMI = MRI.getVRegDef(Reg);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 61fdda8aa109..ebaaf42bc64e 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1430,8 +1430,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return Chain;
SDValue tempChain = Chain;
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
SDValue InFlag = Chain.getValue(1);
unsigned paramCount = 0;
@@ -1549,7 +1548,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getMemIntrinsicNode(
Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
- TheStoreType, MachinePointerInfo(), EltAlign);
+ TheStoreType, MachinePointerInfo(), EltAlign,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
InFlag = Chain.getValue(1);
// Cleanup.
@@ -1609,7 +1610,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
theVal, InFlag };
Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
CopyParamOps, elemtype,
- MachinePointerInfo());
+ MachinePointerInfo(), /* Align */ 0,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
InFlag = Chain.getValue(1);
}
@@ -1795,7 +1798,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
SDValue RetVal = DAG.getMemIntrinsicNode(
Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
- MachinePointerInfo(), EltAlign);
+ MachinePointerInfo(), EltAlign, /* Volatile */ false,
+ /* ReadMem */ true, /* WriteMem */ false, /* Size */ 0);
for (unsigned j = 0; j < NumElts; ++j) {
SDValue Ret = RetVal.getValue(j);
@@ -2579,7 +2583,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
StoreOperands, TheStoreType,
- MachinePointerInfo(), 1);
+ MachinePointerInfo(), /* Align */ 1,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
// Cleanup vector state.
StoreOperands.clear();
}
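Each NVPTX call site now spells out the full trailing parameter list of getMemIntrinsicNode (alignment plus volatile/read/write/size) instead of leaning on defaults, and annotates every literal with its parameter name. Consolidated into one hedged sketch (argument roles taken from those inline comments):

// Inferred call shape for a parameter store; VTs/Ops/MemVT stand in for
// the operands built at each site.
Chain = DAG.getMemIntrinsicNode(Op, dl, VTs, Ops, MemVT,
                                MachinePointerInfo(),
                                /* Align    */ EltAlign,
                                /* Volatile */ false,
                                /* ReadMem  */ false,
                                /* WriteMem */ true,
                                /* Size     */ 0);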
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 9378b29a9d0e..b5b5ea1ed639 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3101,7 +3101,8 @@ def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
(CBranchOther Int1Regs:$a, bb:$target)>;
// Call
-def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
@@ -3126,10 +3127,10 @@ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
: NVPTXInst<outs, ins, asmstr, pattern>;
def Callseq_Start :
- NVPTXInst<(outs), (ins i32imm:$amt),
- "\\{ // callseq $amt\n"
+ NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\\{ // callseq $amt1, $amt2\n"
"\t.reg .b32 temp_param_reg;",
- [(callseq_start timm:$amt)]>;
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def Callseq_End :
NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"\\} // callseq $amt1",
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 12ffbfdeacc1..11d22377611b 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -204,6 +204,17 @@ static const unsigned G8Regs[] = {
PPC::X28, PPC::X29, PPC::X30, PPC::X31
};
+static const unsigned G80Regs[] = {
+ PPC::ZERO8, PPC::X1, PPC::X2, PPC::X3,
+ PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+ PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+ PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+
static const unsigned QFRegs[] = {
PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3,
PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
@@ -301,6 +312,12 @@ static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
return decodeRegisterClass(Inst, RegNo, G8Regs);
}
+static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, G80Regs);
+}
+
#define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass
#define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 609d959c6d08..84bb9ec56800 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -95,7 +95,8 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
- if (MI->getOpcode() == PPC::RLDICR) {
+ if (MI->getOpcode() == PPC::RLDICR ||
+ MI->getOpcode() == PPC::RLDICR_32) {
unsigned char SH = MI->getOperand(2).getImm();
unsigned char ME = MI->getOperand(3).getImm();
// rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 9b91b9ab8f82..2fc8654deeab 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -1330,7 +1330,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
// Issue CALLSEQ_START.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TII.getCallFrameSetupOpcode()))
- .addImm(NumBytes);
+ .addImm(NumBytes).addImm(0);
// Prepare to assign register arguments. Every argument uses up a
// GPR protocol register even if it's passed in a floating-point
@@ -2246,6 +2246,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
}
case PPC::EXTSW:
+ case PPC::EXTSW_32:
case PPC::EXTSW_32_64: {
if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
return false;
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 1b0402bf003d..5fa7b2c6bfb1 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -54,6 +54,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -68,6 +69,14 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-codegen"
+STATISTIC(NumSextSetcc,
+ "Number of (sext(setcc)) nodes expanded into GPR sequence.");
+STATISTIC(NumZextSetcc,
+ "Number of (zext(setcc)) nodes expanded into GPR sequence.");
+STATISTIC(SignExtensionsAdded,
+ "Number of sign extensions for compare inputs added.");
+STATISTIC(ZeroExtensionsAdded,
+ "Number of zero extensions for compare inputs added.");
// FIXME: Remove this once the bug has been fixed!
cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
@@ -252,7 +261,28 @@ namespace {
#include "PPCGenDAGISel.inc"
private:
+ // Conversion type for interpreting results of a 32-bit instruction as
+ // a 64-bit value or vice versa.
+ enum ExtOrTruncConversion { Ext, Trunc };
+
+ // Modifiers to guide how an ISD::SETCC node's result is to be computed
+ // in a GPR.
+ // ZExtOrig - use the original condition code, zero-extend value
+ // ZExtInvert - invert the condition code, zero-extend value
+ // SExtOrig - use the original condition code, sign-extend value
+ // SExtInvert - invert the condition code, sign-extend value
+ enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert };
+
bool trySETCC(SDNode *N);
+ bool tryEXTEND(SDNode *N);
+ SDValue signExtendInputIfNeeded(SDValue Input);
+ SDValue zeroExtendInputIfNeeded(SDValue Input);
+ SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv);
+ SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts);
void PeepholePPC64();
void PeepholePPC64ZExt();
@@ -2471,6 +2501,225 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
return true;
}
+/// If this node is a sign/zero extension of an integer comparison,
+/// it can usually be computed in GPRs rather than using comparison
+/// instructions and ISEL. We only do this on 64-bit targets for now
+/// as the code is specialized for 64-bit (it uses 64-bit instructions
+/// and assumes 64-bit registers).
+bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) {
+ if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
+ return false;
+ assert((N->getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOpcode() == ISD::SIGN_EXTEND) &&
+ "Expecting a zero/sign extend node!");
+
+ if (N->getOperand(0).getOpcode() != ISD::SETCC)
+ return false;
+
+ SDValue WideRes =
+ getSETCCInGPR(N->getOperand(0),
+ N->getOpcode() == ISD::SIGN_EXTEND ?
+ SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig);
+
+ if (!WideRes)
+ return false;
+
+ SDLoc dl(N);
+ bool Inputs32Bit = N->getOperand(0).getOperand(0).getValueType() == MVT::i32;
+ bool Output32Bit = N->getValueType(0) == MVT::i32;
+
+ NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0;
+ NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1;
+
+ SDValue ConvOp = WideRes;
+ if (Inputs32Bit != Output32Bit)
+ ConvOp = addExtOrTrunc(WideRes, Inputs32Bit ? ExtOrTruncConversion::Ext :
+ ExtOrTruncConversion::Trunc);
+ ReplaceNode(N, ConvOp.getNode());
+
+ return true;
+}
+
+/// If the value isn't guaranteed to be sign-extended to 64 bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32 bits).
+SDValue PPCDAGToDAGISel::signExtendInputIfNeeded(SDValue Input) {
+ assert(Input.getValueType() == MVT::i32 &&
+ "Can only sign-extend 32-bit values here.");
+ unsigned Opc = Input.getOpcode();
+
+  // The value was sign-extended and then truncated to 32 bits. No need to
+  // sign-extend it again.
+ if (Opc == ISD::TRUNCATE &&
+ (Input.getOperand(0).getOpcode() == ISD::AssertSext ||
+ Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND))
+ return Input;
+
+ LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+ // The input is a sign-extending load. No reason to sign-extend.
+ if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD)
+ return Input;
+
+ ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+  // We don't sign-extend constants or already sign-extended values.
+ if (InputConst || Opc == ISD::AssertSext || Opc == ISD::SIGN_EXTEND_INREG ||
+ Opc == ISD::SIGN_EXTEND)
+ return Input;
+
+ SDLoc dl(Input);
+ SignExtensionsAdded++;
+ return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32, dl, MVT::i32, Input), 0);
+}
+
+/// If the value isn't guaranteed to be zero-extended to 64 bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32 bits).
+SDValue PPCDAGToDAGISel::zeroExtendInputIfNeeded(SDValue Input) {
+ assert(Input.getValueType() == MVT::i32 &&
+ "Can only zero-extend 32-bit values here.");
+ LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+ unsigned Opc = Input.getOpcode();
+
+ // No need to zero-extend loaded values (unless they're loaded with
+ // a sign-extending load).
+ if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD)
+ return Input;
+
+ ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+ bool InputZExtConst = InputConst && InputConst->getSExtValue() >= 0;
+  // An ISD::TRUNCATE will be lowered to an EXTRACT_SUBREG, so we have to
+  // be conservative and actually clear the high bits. We also don't need
+  // to zero-extend constants or values that are already zero-extended.
+ if (InputZExtConst || Opc == ISD::AssertZext || Opc == ISD::ZERO_EXTEND)
+ return Input;
+
+ SDLoc dl(Input);
+ ZeroExtensionsAdded++;
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32, dl, MVT::i32, Input,
+ getI64Imm(0, dl), getI64Imm(32, dl)),
+ 0);
+}
+
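A scalar sketch of what the RLDICL_32 node above computes, assuming the usual rldicl semantics (rotate left by 0, keep bits 32-63, i.e. clear the high word):

    // Hypothetical C equivalent of "rldicl r, r, 0, 32" on a 64-bit GPR:
    static inline uint64_t clearHighWord(uint64_t r) {
      return r & 0xffffffffULL; // keep only the low 32 bits
    }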
+// Handle a 32-bit value in a 64-bit register and vice versa. These are,
+// of course, not actual zero/sign extensions that will generate machine
+// code; they're just a way to reinterpret a 32-bit value in a register as
+// a 64-bit value and vice versa.
+SDValue PPCDAGToDAGISel::addExtOrTrunc(SDValue NatWidthRes,
+ ExtOrTruncConversion Conv) {
+ SDLoc dl(NatWidthRes);
+
+ // For reinterpreting 32-bit values as 64 bit values, we generate
+ // INSERT_SUBREG IMPLICIT_DEF:i64, <input>, TargetConstant:i32<1>
+ if (Conv == ExtOrTruncConversion::Ext) {
+ SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0);
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+ return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64,
+ ImDef, NatWidthRes, SubRegIdx), 0);
+ }
+
+ assert(Conv == ExtOrTruncConversion::Trunc &&
+         "Unknown conversion between 32- and 64-bit values.");
+ // For reinterpreting 64-bit values as 32-bit values, we just need to
+ // EXTRACT_SUBREG (i.e. extract the low word).
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+ return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32,
+ NatWidthRes, SubRegIdx), 0);
+}
+
+/// Produces a zero-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitZExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ bool IsRHSZero = RHSValue == 0;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5)
+ // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5)
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Clz =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
+ SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl),
+ getI32Imm(31, dl) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+ ShiftOps), 0);
+ }
+ }
+}
+
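A worked scalar analogue of the cntlzw-based seteq sequence above (a sketch; PPC's cntlzw returns 32 for a zero input, whereas C's __builtin_clz(0) is undefined, so the zero case is special-cased):

    // (a == b) without a compare instruction: clz(a ^ b) is 32 iff a == b,
    // and a right shift by 5 maps 32 to 1 and every smaller count to 0.
    unsigned zextEq(unsigned a, unsigned b) {
      unsigned x = a ^ b;
      unsigned clz = x ? __builtin_clz(x) : 32; // cntlzw semantics
      return clz >> 5;
    }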
+/// Produces a sign-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ bool IsRHSZero = RHSValue == 0;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (sext (setcc %a, %b, seteq)) ->
+ // (ashr (shl (ctlz (xor %a, %b)), 58), 63)
+ // (sext (setcc %a, 0, seteq)) ->
+ // (ashr (shl (ctlz %a), 58), 63)
+ SDValue CountInput = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Cntlzw =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0);
+ SDValue SHLOps[] = { Cntlzw, getI32Imm(58, dl), getI32Imm(0, dl) };
+ SDValue Sldi =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICR_32, dl, MVT::i32, SHLOps), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SRADI_32, dl, MVT::i32, Sldi,
+ getI32Imm(63, dl)), 0);
+ }
+ }
+}
+
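The sign-extended variant can be sketched the same way (illustrative only; shifting the count left by 58 parks bit 5, which is set only when the count is 32, in the sign bit, and the arithmetic shift by 63 then smears it across the register):

    #include <cstdint>
    int64_t sextEq(uint32_t a, uint32_t b) {
      uint32_t x = a ^ b;
      uint64_t clz = x ? __builtin_clz(x) : 32; // cntlzw semantics
      return (int64_t)(clz << 58) >> 63;        // -1 if a == b, else 0
    }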
+/// Returns an equivalent of a SETCC node but with the result the same width as
+/// the inputs. This can also be used for SELECT_CC if either the true or
+/// false value is a power of two while the other is zero.
+SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare,
+ SetccInGPROpts ConvOpts) {
+ assert((Compare.getOpcode() == ISD::SETCC ||
+ Compare.getOpcode() == ISD::SELECT_CC) &&
+         "An ISD::SETCC or ISD::SELECT_CC node required here.");
+
+ SDValue LHS = Compare.getOperand(0);
+ SDValue RHS = Compare.getOperand(1);
+
+ // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC.
+ int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2;
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get();
+ EVT InputVT = LHS.getValueType();
+ if (InputVT != MVT::i32)
+ return SDValue();
+
+ SDLoc dl(Compare);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX;
+
+ if (ConvOpts == SetccInGPROpts::ZExtInvert ||
+ ConvOpts == SetccInGPROpts::SExtInvert)
+ CC = ISD::getSetCCInverse(CC, true);
+
+ if (ISD::isSignedIntSetCC(CC)) {
+ LHS = signExtendInputIfNeeded(LHS);
+ RHS = signExtendInputIfNeeded(RHS);
+ } else if (ISD::isUnsignedIntSetCC(CC)) {
+ LHS = zeroExtendInputIfNeeded(LHS);
+ RHS = zeroExtendInputIfNeeded(RHS);
+ }
+
+ bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig ||
+ ConvOpts == SetccInGPROpts::SExtInvert;
+ if (IsSext)
+ return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
+ return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
+}
+
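An illustrative walk-through of the inversion path (hypothetical input; the SELECT_CC use mentioned in the comment is not wired up in this patch):

    // getSETCCInGPR((setcc i32 %a, 0, setne), ZExtInvert) inverts the
    // condition to seteq, so get32BitZExtCompare emits the zero-RHS form
    //   (lshr (cntlzw %a), 5)
    // i.e. the returned value is the zero-extended result of (%a == 0).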
void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
// Transfer memoperands.
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
@@ -2508,6 +2757,12 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
break;
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ if (tryEXTEND(N))
+ return;
+ break;
+
case ISD::SETCC:
if (trySETCC(N))
return;
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 685f24cb502e..17bdd595da10 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -923,6 +923,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
// We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget.hasFPCVT())
@@ -4949,8 +4952,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be moved somewhere else
@@ -5000,9 +5002,8 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
Flags, DAG, dl);
// This must go outside the CALLSEQ_START..END.
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1),
- SDLoc(MemcpyCall));
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
+ SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
Chain = CallSeqStart = NewCallSeqStart;
@@ -5083,9 +5084,9 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
CallSeqStart.getNode()->getOperand(0),
Flags, DAG, dl);
// The MEMCPY must go outside the CALLSEQ_START..END.
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1),
- SDLoc(MemcpyCall));
+ int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
+ SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
return NewCallSeqStart;
@@ -5268,8 +5269,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be moved somewhere else
@@ -5828,8 +5828,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be moved somewhere else
@@ -8741,9 +8740,9 @@ static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
// The mappings for emitLeading/TrailingFence is taken from
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -8751,10 +8750,10 @@ Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
return nullptr;
}
-Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
- if (IsLoad && isAcquireOrStronger(Ord))
+Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord))
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
// FIXME: this is too conservative, a dependent branch + isync is enough.
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
@@ -11316,6 +11315,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
+ case ISD::SHL:
+ return combineSHL(N, DCI);
+ case ISD::SRA:
+ return combineSRA(N, DCI);
+ case ISD::SRL:
+ return combineSRL(N, DCI);
case PPCISD::SHL:
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
@@ -12948,3 +12953,58 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
return Imm.isPosZero();
}
}
+
+// For vector shift operation op, fold
+// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
+static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+ unsigned Opcode = N->getOpcode();
+ unsigned TargetOpcode;
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected shift operation");
+ case ISD::SHL:
+ TargetOpcode = PPCISD::SHL;
+ break;
+ case ISD::SRL:
+ TargetOpcode = PPCISD::SRL;
+ break;
+ case ISD::SRA:
+ TargetOpcode = PPCISD::SRA;
+ break;
+ }
+
+ if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
+ N1->getOpcode() == ISD::AND)
+ if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
+ if (Mask->getZExtValue() == OpSizeInBits - 1)
+ return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));
+
+ return SDValue();
+}
+
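A concrete instance of the fold (a sketch): for a v4i32 shift, OpSizeInBits is 32, so a splat mask of 31 on the amount is redundant; per the PPCISD node comment in PPCISelLowering.h, the vector shift instructions already use only the low log2(32) = 5 bits of each element:

    // (shl v4i32:%x, (and v4i32:%y, splat(31))) --> (PPCshl %x, %y)
    // Scalar analogy of why the AND is dead (illustrative only):
    unsigned shlModulo32(unsigned x, unsigned y) {
      return x << (y & 31); // the hardware applies the same masking itself
    }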
+SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 32661099b79d..4fc744257262 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -117,9 +117,13 @@ namespace llvm {
/// at function entry, used for PIC code.
GlobalBaseReg,
- /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
- /// shift amounts. These nodes are generated by the multi-precision shift
- /// code.
+ /// These nodes represent PPC shifts.
+ ///
+ /// For scalar types, only the last `n + 1` bits of the shift amounts
+ /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc.
+ /// for exact behaviors.
+ ///
+ /// For vector types, only the last n bits are used. See vsld.
SRL, SRA, SHL,
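Worked numbers for the rule above (illustrative): for i32 elements, sizeof(element) * 8 = 32 and n = log2(32) = 5, so a scalar shift such as slw consumes the low n + 1 = 6 bits of the amount, while a vector shift such as vslw consumes only the low 5 bits of each element.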
/// The combination of sra[wd]i and addze used to implemented signed
@@ -617,10 +621,10 @@ namespace llvm {
return true;
}
- Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
- Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
+ Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
@@ -999,6 +1003,9 @@ namespace llvm {
SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
/// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
/// SETCC with integer subtraction when (1) there is a legal way of doing it
@@ -1017,14 +1024,6 @@ namespace llvm {
SDValue
combineElementTruncationToVectorTruncation(SDNode *N,
DAGCombinerInfo &DCI) const;
-
- bool supportsModuloShift(ISD::NodeType Inst,
- EVT ReturnType) const override {
- assert((Inst == ISD::SHL || Inst == ISD::SRA || Inst == ISD::SRL) &&
- "Expect a shift instruction");
- assert(isOperationLegal(Inst, ReturnType));
- return ReturnType.isVector();
- }
};
namespace PPC {
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 997b96ca6ec8..a8433919f0f3 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -634,10 +634,19 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS),
"extsw", "$rA, $rS", IIC_IntSimple,
[(set i64:$rA, (sext i32:$rS))]>, isPPC64;
+let isCodeGenOnly = 1 in
+def EXTSW_32 : XForm_11<31, 986, (outs gprc:$rA), (ins gprc:$rS),
+ "extsw $rA, $rS", IIC_IntSimple,
+ []>, isPPC64;
defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
"sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
[(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
+// For fast-isel:
+let isCodeGenOnly = 1 in
+def SRADI_32 : XSForm_1<31, 413, (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH),
+ "sradi $rA, $rS, $SH", IIC_IntRotateDI, []>, isPPC64;
+
defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
"cntlzd", "$rA, $rS", IIC_IntGeneral,
[(set i64:$rA, (ctlz i64:$rS))]>;
@@ -721,15 +730,26 @@ defm RLDICL : MDForm_1r<30, 0,
// For fast-isel:
let isCodeGenOnly = 1 in
def RLDICL_32_64 : MDForm_1<30, 0,
- (outs g8rc:$rA),
- (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
- "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
- []>, isPPC64;
+ (outs g8rc:$rA),
+ (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
// End fast-isel.
+let isCodeGenOnly = 1 in
+def RLDICL_32 : MDForm_1<30, 0,
+ (outs gprc:$rA),
+ (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
defm RLDICR : MDForm_1r<30, 1,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
"rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
[]>, isPPC64;
+let isCodeGenOnly = 1 in
+def RLDICR_32 : MDForm_1<30, 1,
+ (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicr $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
defm RLDIC : MDForm_1r<30, 2,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
"rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index c380766e9f5c..e14d18fd5433 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -987,6 +987,12 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSLH $vA, $vB))>;
def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
+def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSLB $vA, $vB))>;
+def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSLH $vA, $vB))>;
+def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSLW $vA, $vB))>;
def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
@@ -994,6 +1000,12 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRH $vA, $vB))>;
def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
+def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSRB $vA, $vB))>;
+def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSRH $vA, $vB))>;
+def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSRW $vA, $vB))>;
def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRAB $vA, $vB))>;
@@ -1001,6 +1013,12 @@ def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRAH $vA, $vB))>;
def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRAW $vA, $vB))>;
+def : Pat<(v16i8 (PPCsra v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSRAB $vA, $vB))>;
+def : Pat<(v8i16 (PPCsra v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSRAH $vA, $vB))>;
+def : Pat<(v4i32 (PPCsra v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSRAW $vA, $vB))>;
// Float to integer and integer to float conversions
def : Pat<(v4i32 (fp_to_sint v4f32:$vA)),
@@ -1072,14 +1090,24 @@ def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB),
// Vector shifts
def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>;
def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsld $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>;
+ "vsld $vD, $vA, $vB", IIC_VecGeneral, []>;
def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsrd $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>;
+ "vsrd $vD, $vA, $vB", IIC_VecGeneral, []>;
def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsrad $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>;
+ "vsrad $vD, $vA, $vB", IIC_VecGeneral, []>;
+
+def : Pat<(v2i64 (shl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSLD $vA, $vB))>;
+def : Pat<(v2i64 (PPCshl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSLD $vA, $vB))>;
+def : Pat<(v2i64 (srl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRD $vA, $vB))>;
+def : Pat<(v2i64 (PPCsrl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRD $vA, $vB))>;
+def : Pat<(v2i64 (sra v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRAD $vA, $vB))>;
+def : Pat<(v2i64 (PPCsra v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRAD $vA, $vB))>;
// Vector Integer Arithmetic Instructions
let isCommutable = 1 in {
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index f004ce49cac0..1af5e7f28342 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -33,7 +33,8 @@ def SDT_PPCVexts : SDTypeProfile<1, 2, [
SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
]>;
-def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_PPCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
def SDT_PPCvperm : SDTypeProfile<1, 3, [
@@ -1099,9 +1100,11 @@ multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
let hasCtrlDep = 1 in {
let Defs = [R1], Uses = [R1] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
-def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2",
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+ "#ADJCALLSTACKUP $amt1 $amt2",
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
@@ -4163,6 +4166,8 @@ def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0
def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+def : InstAlias<"clrldi $rA, $rS, $n",
+ (RLDICL_32 gprc:$rA, gprc:$rS, 0, u6imm:$n)>;
def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b",
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 967557452f24..b98140fedfc0 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1436,7 +1436,7 @@ let Predicates = [IsISA3_0, HasDirectMove] in {
def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
"mtvsrws $XT, $rA", IIC_VecGeneral, []>;
- def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+ def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB),
"mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
[]>, Requires<[In64BitMode]>;
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 0c1260a2965b..c7aa4cb78b7a 100644
--- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -99,7 +99,8 @@ protected:
// Don't really need to save data to the stack - the clobbered
// registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr)
// gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR).
- BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0);
+ BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0)
+ .addImm(0);
// Expand into two ops built prior to the existing instruction.
MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3)
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index acb34d5baaa8..9e7e3c6b705a 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -773,8 +773,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
}
}
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
@@ -1165,8 +1164,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Adjust the stack pointer to make room for the arguments.
// FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
// with more than 6 arguments.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
// Collect the set of registers to pass to the function and their values.
// This will be emitted as a sequence of CopyToReg nodes glued to the call
@@ -2058,7 +2056,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SDValue Chain = DAG.getEntryNode();
SDValue InFlag;
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, DL, true), DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL);
Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
InFlag = Chain.getValue(1);
SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
@@ -3386,7 +3384,10 @@ SparcTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default: break;
- case 'r': return C_RegisterClass;
+ case 'r':
+ case 'f':
+ case 'e':
+ return C_RegisterClass;
case 'I': // SIMM13
return C_Other;
}
@@ -3465,6 +3466,24 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &SP::IntPairRegClass);
else
return std::make_pair(0U, &SP::IntRegsRegClass);
+ case 'f':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &SP::FPRegsRegClass);
+ else if (VT == MVT::f64)
+ return std::make_pair(0U, &SP::LowDFPRegsRegClass);
+ else if (VT == MVT::f128)
+ return std::make_pair(0U, &SP::LowQFPRegsRegClass);
+ llvm_unreachable("Unknown ValueType for f-register-type!");
+ break;
+ case 'e':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &SP::FPRegsRegClass);
+ else if (VT == MVT::f64)
+ return std::make_pair(0U, &SP::DFPRegsRegClass);
+ else if (VT == MVT::f128)
+ return std::make_pair(0U, &SP::QFPRegsRegClass);
+ llvm_unreachable("Unknown ValueType for e-register-type!");
+ break;
}
} else if (!Constraint.empty() && Constraint.size() <= 5
&& Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
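A usage sketch that the new constraints enable (hypothetical example, not part of the patch; per the code above, 'e' maps f64/f128 to the full DFP/QFP classes while 'f' restricts them to the low registers):

    double square(double x) {
      double r;
      asm("fmuld %1, %1, %0" : "=e"(r) : "e"(x)); // 'e' picks a double reg
      return r;
    }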
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 5a19c624abb5..ae45c8be6752 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -195,7 +195,8 @@ def SPsjlj_longjmp: SDNode<"SPISD::EH_SJLJ_LONGJMP",
[SDNPHasChain, SDNPSideEffect]>;
// These are target-independent nodes, but have target-specific formats.
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
@@ -404,9 +405,9 @@ let Defs = [O7] in {
}
let Defs = [O6], Uses = [O6] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- "!ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "!ADJCALLSTACKDOWN $amt1, $amt2",
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"!ADJCALLSTACKUP $amt1",
[(callseq_end timm:$amt1, timm:$amt2)]>;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index 6ecfddfc7d66..6625eaafd992 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -346,11 +346,13 @@ def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>;
// Floating point register classes.
def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
-
def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>;
-
def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
+// The Low?FPRegs classes are used only for inline-asm constraints.
+def LowDFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>;
+def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>;
+
// Floating point control register classes.
def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>;
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 3f91ca9035a6..efcf6696fd50 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -262,6 +262,9 @@ public:
bool isMemDisp20(MemoryKind MemKind, RegisterKind RegKind) const {
return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287);
}
+ bool isMemDisp12Len4(RegisterKind RegKind) const {
+ return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x10);
+ }
bool isMemDisp12Len8(RegisterKind RegKind) const {
return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x100);
}
@@ -347,6 +350,7 @@ public:
bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, ADDR64Reg); }
bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, ADDR64Reg); }
bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, ADDR64Reg); }
+ bool isBDLAddr64Disp12Len4() const { return isMemDisp12Len4(ADDR64Reg); }
bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); }
bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, ADDR64Reg); }
bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, ADDR64Reg); }
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index a281a0aa6bcc..27fd70bc6092 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -327,6 +327,18 @@ static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field,
return MCDisassembler::Success;
}
+static DecodeStatus decodeBDLAddr12Len4Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) {
+ uint64_t Length = Field >> 16;
+ uint64_t Base = (Field >> 12) & 0xf;
+ uint64_t Disp = Field & 0xfff;
+ assert(Length < 16 && "Invalid BDLAddr12Len4");
+ Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+ Inst.addOperand(MCOperand::createImm(Disp));
+ Inst.addOperand(MCOperand::createImm(Length + 1));
+ return MCDisassembler::Success;
+}
+
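A sketch of the 20-bit field layout unpacked above (hypothetical helper; the stored length is the operand length minus one, hence the "+ 1" when the operand is added):

    #include <cstdint>
    // Field bits: [19:16] = length - 1, [15:12] = base reg, [11:0] = disp.
    // E.g. Field = 0x34567 decodes to base Regs[4], disp 0x567, length 4.
    void splitBDL4(uint64_t Field, uint64_t &LenM1, uint64_t &Base,
                   uint64_t &Disp) {
      LenM1 = Field >> 16;
      Base = (Field >> 12) & 0xf;
      Disp = Field & 0xfff;
    }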
static DecodeStatus decodeBDLAddr12Len8Operand(MCInst &Inst, uint64_t Field,
const unsigned *Regs) {
uint64_t Length = Field >> 16;
@@ -399,6 +411,13 @@ static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
}
+static DecodeStatus decodeBDLAddr64Disp12Len4Operand(MCInst &Inst,
+ uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDLAddr12Len4Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst,
uint64_t Field,
uint64_t Address,
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index 092eb4011adc..d188f56512ab 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -77,6 +77,9 @@ private:
uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ uint64_t getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -220,6 +223,17 @@ getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
}
uint64_t SystemZMCCodeEmitter::
+getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1;
+ assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Len));
+ return (Len << 16) | (Base << 12) | Disp;
+}
+
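A round-trip check of the encoder above (illustrative values; assumes the operands were already range-checked, as the assert demands):

    #include <cstdint>
    constexpr uint64_t encodeBDL4(uint64_t Base, uint64_t Disp, uint64_t Len) {
      return ((Len - 1) << 16) | (Base << 12) | Disp;
    }
    static_assert(encodeBDL4(4, 0x567, 4) == 0x34567,
                  "matches the Len4 decoder's field layout");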
+uint64_t SystemZMCCodeEmitter::
getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
index 86a1322c9e23..74cf653b9d95 100644
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@@ -63,7 +63,7 @@ via a register.)
--
-We don't use ICM or STCM.
+We don't use ICM, STCM, or CLM.
--
diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td
index 716e5add8051..7bfa378aa85c 100644
--- a/lib/Target/SystemZ/SystemZFeatures.td
+++ b/lib/Target/SystemZ/SystemZFeatures.td
@@ -68,6 +68,11 @@ def FeaturePopulationCount : SystemZFeature<
"Assume that the population-count facility is installed"
>;
+def FeatureMessageSecurityAssist4 : SystemZFeature<
+ "message-security-assist-extension4", "MessageSecurityAssist4",
+ "Assume that the message-security-assist extension facility 4 is installed"
+>;
+
def Arch9NewFeatures : SystemZFeatureList<[
FeatureDistinctOps,
FeatureFastSerialization,
@@ -75,7 +80,8 @@ def Arch9NewFeatures : SystemZFeatureList<[
FeatureHighWord,
FeatureInterlockedAccess1,
FeatureLoadStoreOnCond,
- FeaturePopulationCount
+ FeaturePopulationCount,
+ FeatureMessageSecurityAssist4
]>;
//===----------------------------------------------------------------------===//
@@ -133,6 +139,11 @@ def FeatureLoadStoreOnCond2 : SystemZFeature<
"Assume that the load/store-on-condition facility 2 is installed"
>;
+def FeatureMessageSecurityAssist5 : SystemZFeature<
+ "message-security-assist-extension5", "MessageSecurityAssist5",
+ "Assume that the message-security-assist extension facility 5 is installed"
+>;
+
def FeatureVector : SystemZFeature<
"vector", "Vector",
"Assume that the vectory facility is installed"
@@ -142,6 +153,7 @@ def FeatureNoVector : SystemZMissingFeature<"Vector">;
def Arch11NewFeatures : SystemZFeatureList<[
FeatureLoadAndZeroRightmostByte,
FeatureLoadStoreOnCond2,
+ FeatureMessageSecurityAssist5,
FeatureVector
]>;
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 6989aabb8c6a..235e095f0010 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1110,9 +1110,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Mark the start of the call.
if (!IsTailCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, DL, PtrVT, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
@@ -6354,3 +6352,12 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
llvm_unreachable("Unexpected instr type to insert");
}
}
+
+// This is used only by the isel schedulers, and is needed to prevent the
+// compiler from crashing when list-ilp is used.
+const TargetRegisterClass *
+SystemZTargetLowering::getRepRegClassFor(MVT VT) const {
+ if (VT == MVT::Untyped)
+ return &SystemZ::ADDR128BitRegClass;
+ return TargetLowering::getRepRegClassFor(VT);
+}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 1c34dc43e8bb..79c8c4d92669 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -590,6 +590,8 @@ private:
MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI,
MachineBasicBlock *MBB,
unsigned Opcode) const;
+
+ const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index bb6d27e24828..364b81f98eed 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -458,6 +458,12 @@ def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>;
def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>;
def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>;
+// Divide to integer.
+let Defs = [CC] in {
+ def DIEBR : TernaryRRFb<"diebr", 0xB353, FP32, FP32, FP32>;
+ def DIDBR : TernaryRRFb<"didbr", 0xB35B, FP64, FP64, FP64>;
+}
+
//===----------------------------------------------------------------------===//
// Comparisons
//===----------------------------------------------------------------------===//
@@ -469,6 +475,13 @@ let Defs = [CC], CCValues = 0xF in {
def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>;
def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>;
+
+ def KEBR : CompareRRE<"kebr", 0xB308, null_frag, FP32, FP32>;
+ def KDBR : CompareRRE<"kdbr", 0xB318, null_frag, FP64, FP64>;
+ def KXBR : CompareRRE<"kxbr", 0xB348, null_frag, FP128, FP128>;
+
+ def KEB : CompareRXE<"keb", 0xED08, null_frag, FP32, load, 4>;
+ def KDB : CompareRXE<"kdb", 0xED18, null_frag, FP64, load, 8>;
}
// Test Data Class.
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index c727f486087e..a37da2807854 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -710,6 +710,21 @@ class InstRSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = RI2;
}
+class InstRSLa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = 0;
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
class InstRSYa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -817,6 +832,37 @@ class InstSSa<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = BD2;
}
+class InstSSb<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+ bits<20> BDL2;
+
+ let Inst{47-40} = op;
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = BDL2{19-16};
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-0} = BDL2{15-0};
+}
+
+class InstSSc<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+ bits<16> BD2;
+ bits<4> I3;
+
+ let Inst{47-40} = op;
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = I3;
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-0} = BD2;
+}
+
class InstSSd<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -850,6 +896,20 @@ class InstSSe<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = BD4;
}
+class InstSSf<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<16> BD1;
+ bits<24> BDL2;
+
+ let Inst{47-40} = op;
+ let Inst{39-32} = BDL2{23-16};
+ let Inst{31-16} = BD1;
+ let Inst{15-0} = BDL2{15-0};
+}
+
class InstSSE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -1567,6 +1627,9 @@ class ICV<string name>
// Inherent:
// One register output operand and no input operands.
//
+// InherentDual:
+// Two register output operands and no input operands.
+//
// StoreInherent:
// One address operand. The instruction stores to the address.
//
@@ -1642,8 +1705,9 @@ class ICV<string name>
// Two input operands and an implicit CC output operand.
//
// Test:
-// Two input operands and an implicit CC output operand. The second
-// input operand is an "address" operand used as a test class mask.
+// One or two input operands and an implicit CC output operand. If
+// present, the second input operand is an "address" operand used as
+// a test class mask.
//
// Ternary:
// One register output operand and three input operands.
@@ -1691,6 +1755,10 @@ class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
let R2 = 0;
}
+class InherentDualRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRRE<opcode, (outs cls:$R1, cls:$R2), (ins),
+ mnemonic#"\t$R1, $R2", []>;
+
class InherentVRIa<string mnemonic, bits<16> opcode, bits<16> value>
: InstVRIa<opcode, (outs VR128:$V1), (ins), mnemonic#"\t$V1", []> {
let I2 = value;
@@ -1714,6 +1782,12 @@ class SideEffectInherentS<string mnemonic, bits<16> opcode,
let BD2 = 0;
}
+class SideEffectInherentRRE<string mnemonic, bits<16> opcode>
+ : InstRRE<opcode, (outs), (ins), mnemonic, []> {
+ let R1 = 0;
+ let R2 = 0;
+}
+
// Allow an optional TLS marker symbol to generate TLS call relocations.
class CallRI<string mnemonic, bits<12> opcode>
: InstRIb<opcode, (outs), (ins GR64:$R1, brtarget16tls:$RI2),
@@ -2084,6 +2158,13 @@ multiclass LoadMultipleRSPair<string mnemonic, bits<8> rsOpcode,
}
}
+class LoadMultipleSSe<string mnemonic, bits<8> opcode, RegisterOperand cls>
+ : InstSSe<opcode, (outs cls:$R1, cls:$R3),
+ (ins bdaddr12only:$BD2, bdaddr12only:$BD4),
+ mnemonic#"\t$R1, $R3, $BD2, $BD4", []> {
+ let mayLoad = 1;
+}
+
class LoadMultipleVRSa<string mnemonic, bits<16> opcode>
: InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2),
mnemonic#"\t$V1, $V3, $BD2", []> {
@@ -2355,6 +2436,15 @@ class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let OpType = "reg";
}
+class UnaryMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let M3 = 0;
+}
+
class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, Immediate imm>
: InstRIa<opcode, (outs cls:$R1), (ins imm:$I2),
@@ -2585,11 +2675,61 @@ class SideEffectBinaryIE<string mnemonic, bits<16> opcode,
: InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2),
mnemonic#"\t$I1, $I2", []>;
+class SideEffectBinarySI<string mnemonic, bits<8> opcode, Operand imm>
+ : InstSI<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2", []>;
+
class SideEffectBinarySIL<string mnemonic, bits<16> opcode,
SDPatternOperator operator, Immediate imm>
: InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2", [(operator bdaddr12only:$BD1, imm:$I2)]>;
+class SideEffectBinarySSa<string mnemonic, bits<8> opcode>
+ : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1, bdaddr12only:$BD2),
+ mnemonic##"\t$BDL1, $BD2", []>;
+
+class SideEffectBinarySSb<string mnemonic, bits<8> opcode>
+ : InstSSb<opcode,
+ (outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2),
+ mnemonic##"\t$BDL1, $BDL2", []>;
+
+class SideEffectBinarySSf<string mnemonic, bits<8> opcode>
+ : InstSSf<opcode, (outs), (ins bdaddr12only:$BD1, bdladdr12onlylen8:$BDL2),
+ mnemonic##"\t$BD1, $BDL2", []>;
+
+class SideEffectBinaryMemMemRR<string mnemonic, bits<8> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRR<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
+class SideEffectBinaryMemRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls2:$R2), (ins cls1:$R1, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R2 = $R2src";
+ let DisableEncoding = "$R2src";
+}
+
+class SideEffectBinaryMemMemRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
+class SideEffectBinaryMemMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+ let M3 = 0;
+}
+
class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2>
: InstRR<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
@@ -2654,6 +2794,20 @@ class BinaryRRFb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M4 = 0;
}
+class BinaryMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2, Immediate imm>
+ : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src, imm:$M3),
+ mnemonic#"\t$R1, $R2, $M3", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+multiclass BinaryMemRRFcOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2> {
+ def "" : BinaryMemRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>;
+ def Opt : UnaryMemRRFc<mnemonic, opcode, cls1, cls2>;
+}
+
class BinaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
: InstRRFe<opcode, (outs cls1:$R1), (ins imm32zx4:$M3, cls2:$R2),
@@ -3112,6 +3266,34 @@ class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let AccessBytes = bytes;
}
+class StoreBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr12only>
+ : InstRSb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+class StoreBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+multiclass StoreBinaryRSPair<string mnemonic, bits<8> rsOpcode,
+ bits<16> rsyOpcode, RegisterOperand cls,
+ bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : StoreBinaryRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : StoreBinaryRSY<mnemonic#"y", rsyOpcode, cls, bytes,
+ bdaddr20pair>;
+ }
+}
+
class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
Immediate index>
: InstVRV<opcode, (outs), (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3),
@@ -3237,6 +3419,40 @@ multiclass CompareRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
}
}
+class CompareRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr12only>
+ : InstRSb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+class CompareRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+multiclass CompareRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
+ RegisterOperand cls, bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : CompareRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : CompareRSY<mnemonic#"y", rsyOpcode, cls, bytes, bdaddr20pair>;
+ }
+}
+
+class CompareSSb<string mnemonic, bits<8> opcode>
+ : InstSSb<opcode,
+ (outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2),
+ mnemonic##"\t$BDL1, $BDL2", []> {
+ let isCompare = 1;
+ let mayLoad = 1;
+}
+
class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
SDPatternOperator load, Immediate imm,
AddressingMode mode = bdaddr12only>
@@ -3313,18 +3529,68 @@ class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M3 = 0;
}
+class TestRSL<string mnemonic, bits<16> opcode>
+ : InstRSLa<opcode, (outs), (ins bdladdr12onlylen4:$BDL1),
+ mnemonic#"\t$BDL1", []> {
+ let mayLoad = 1;
+}
+
+class SideEffectTernarySSc<string mnemonic, bits<8> opcode>
+ : InstSSc<opcode, (outs), (ins bdladdr12onlylen4:$BDL1,
+ shift12only:$BD2, imm32zx4:$I3),
+ mnemonic##"\t$BDL1, $BD2, $I3", []>;
+
+class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs cls1:$R1, cls2:$R2, cls3:$R3),
+ (ins cls1:$R1src, cls2:$R2src, cls3:$R3src),
+ mnemonic#"\t$R1, $R3, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R2src, $R3src";
+ let M4 = 0;
+}
+
class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode,
RegisterOperand cls1, RegisterOperand cls2,
Immediate imm>
: InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2, imm:$M3),
mnemonic#"\t$R1, $R2, $M3", []>;
+class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ Immediate imm>
+ : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2),
+ (ins cls1:$R1src, cls2:$R2src, imm:$M3),
+ mnemonic#"\t$R1, $R2, $M3", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
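+// Some RRF-c instructions make the M3 operand optional in the assembler;
+// define both the explicit form and an "Opt" form that omits it.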
+multiclass SideEffectTernaryMemMemRRFcOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2> {
+ def "" : SideEffectTernaryMemMemRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>;
+ def Opt : SideEffectBinaryMemMemRRFc<mnemonic, opcode, cls1, cls2>;
+}
+
class SideEffectTernarySSF<string mnemonic, bits<12> opcode,
RegisterOperand cls>
: InstSSF<opcode, (outs),
(ins bdaddr12only:$BD1, bdaddr12only:$BD2, cls:$R3),
mnemonic#"\t$BD1, $BD2, $R3", []>;
+class TernaryRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs cls1:$R1, cls3:$R3),
+ (ins cls1:$R1src, cls2:$R2, imm32zx4:$M4),
+ mnemonic#"\t$R1, $R3, $R2, $M4", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
class TernaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
: InstRRFe<opcode, (outs cls1:$R1),
@@ -3376,6 +3642,24 @@ multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
}
}
+class SideEffectTernaryMemMemRS<string mnemonic, bits<8> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSa<opcode, (outs cls1:$R1, cls2:$R3),
+ (ins cls1:$R1src, cls2:$R3src, shift12only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let Constraints = "$R1 = $R1src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R3src";
+}
+
+class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSYa<opcode, (outs cls1:$R1, cls2:$R3),
+ (ins cls1:$R1src, cls2:$R3src, shift20only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let Constraints = "$R1 = $R1src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R3src";
+}
+
class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
: InstRXF<opcode, (outs cls:$R1),
@@ -3981,9 +4265,7 @@ class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
// another instruction to handle the excess.
multiclass MemorySS<string mnemonic, bits<8> opcode,
SDPatternOperator sequence, SDPatternOperator loop> {
- def "" : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
- bdaddr12only:$BD2),
- mnemonic##"\t$BDL1, $BD2", []>;
+ def "" : SideEffectBinarySSa<mnemonic, opcode>;
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
imm64:$length),
@@ -4003,13 +4285,8 @@ multiclass MemorySS<string mnemonic, bits<8> opcode,
// the full loop (the main instruction plus the branch on CC==3).
multiclass StringRRE<string mnemonic, bits<16> opcode,
SDPatternOperator operator> {
- def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2),
- (ins GR64:$R1src, GR64:$R2src),
- mnemonic#"\t$R1, $R2", []> {
- let Uses = [R0L];
- let Constraints = "$R1 = $R1src, $R2 = $R2src";
- let DisableEncoding = "$R1src, $R2src";
- }
+ let Uses = [R0L] in
+ def "" : SideEffectBinaryMemMemRRE<mnemonic, opcode, GR64, GR64>;
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
def Loop : Pseudo<(outs GR64:$end),
(ins GR64:$start1, GR64:$start2, GR32:$char),
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index d63525f29412..fa5ecdd85243 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
let hasNoSchedulingInfo = 1 in {
- def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
- [(callseq_start timm:$amt)]>;
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
@@ -464,6 +464,11 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>;
// Memory-to-memory moves.
let mayLoad = 1, mayStore = 1 in
defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
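+// MVCL and friends describe their operands with GR128 register pairs and
+// set the condition code; like MVC, they both load and store.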
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ def MVCL : SideEffectBinaryMemMemRR<"mvcl", 0x0E, GR128, GR128>;
+ def MVCLE : SideEffectTernaryMemMemRS<"mvcle", 0xA8, GR128, GR128>;
+ def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>;
+}
// String moves.
let mayLoad = 1, mayStore = 1, Defs = [CC] in
@@ -707,6 +712,10 @@ def : StoreGR64PC<STHRL, aligned_truncstorei16>;
defm : StoreGR64Pair<ST, STY, truncstorei32>;
def : StoreGR64PC<STRL, aligned_truncstorei32>;
+// Store characters under mask -- not (yet) used for codegen.
+defm STCM : StoreBinaryRSPair<"stcm", 0xBE, 0xEB2D, GR32, 0>;
+def STCMH : StoreBinaryRSY<"stcmh", 0xEB2C, GRH32, 0>;
+
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
@@ -715,6 +724,7 @@ def : StoreGR64PC<STRL, aligned_truncstorei32>;
defm LM : LoadMultipleRSPair<"lm", 0x98, 0xEB98, GR32>;
def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>;
def LMH : LoadMultipleRSY<"lmh", 0xEB96, GRH32>;
+def LMD : LoadMultipleSSe<"lmd", 0xEF, GR64>;
// Multi-register stores.
defm STM : StoreMultipleRSPair<"stm", 0x90, 0xEB90, GR32>;
@@ -742,6 +752,10 @@ def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>;
def STRV : StoreRXY<"strv", 0xE33E, z_strv, GR32, 4>;
def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>;
+// Byte-swapping memory-to-memory moves.
+let mayLoad = 1, mayStore = 1 in
+ def MVCIN : SideEffectBinarySSa<"mvcin", 0xE8>;
+
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
@@ -816,6 +830,7 @@ defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
defm : InsertMem<"inserti8", IC, GR64, azextloadi8, bdxaddr12pair>;
defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
+// Insert characters under mask -- not (yet) used for codegen.
let Defs = [CC] in {
defm ICM : TernaryRSPair<"icm", 0xBF, 0xEB81, GR32, 0>;
def ICMH : TernaryRSY<"icmh", 0xEB80, GRH32, 0>;
@@ -919,6 +934,10 @@ let Defs = [CC] in {
defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load, 8>;
+
+ // Addition to memory.
+ def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>;
+ def ALGSI : BinarySIY<"algsi", 0xEB7E, null_frag, imm64sx8>;
}
defm : ZXB<addc, GR64, ALGFR>;
@@ -1166,9 +1185,14 @@ def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>;
// Multiplication of a register, producing two results.
+def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>;
+def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>;
def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>;
// Multiplication of memory, producing two results.
+def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>;
+def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>;
+def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>;
def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
//===----------------------------------------------------------------------===//
@@ -1177,12 +1201,14 @@ def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
let hasSideEffects = 1 in { // Do not speculatively execute.
// Division and remainder, from registers.
+ def DR : BinaryRR <"dr", 0x1D, null_frag, GR128, GR32>;
def DSGFR : BinaryRRE<"dsgfr", 0xB91D, z_sdivrem32, GR128, GR32>;
def DSGR : BinaryRRE<"dsgr", 0xB90D, z_sdivrem64, GR128, GR64>;
def DLR : BinaryRRE<"dlr", 0xB997, z_udivrem32, GR128, GR32>;
def DLGR : BinaryRRE<"dlgr", 0xB987, z_udivrem64, GR128, GR64>;
// Division and remainder, from memory.
+ def D : BinaryRX <"d", 0x5D, null_frag, GR128, load, 4>;
def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>;
def DSG : BinaryRXY<"dsg", 0xE30D, z_sdivrem64, GR128, load, 8>;
def DL : BinaryRXY<"dl", 0xE397, z_udivrem32, GR128, load, 4>;
@@ -1193,23 +1219,32 @@ let hasSideEffects = 1 in { // Do not speculatively execute.
// Shifts
//===----------------------------------------------------------------------===//
-// Shift left.
+// Logical shift left.
let hasSideEffects = 0 in {
defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
- defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>;
def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
+ def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>;
+}
+
+// Arithmetic shift left.
+let Defs = [CC] in {
+ defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>;
+ def SLAG : BinaryRSY<"slag", 0xEB0B, null_frag, GR64>;
+ def SLDA : BinaryRS<"slda", 0x8F, null_frag, GR128>;
}
// Logical shift right.
let hasSideEffects = 0 in {
defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
+ def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>;
}
// Arithmetic shift right.
let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>;
def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>;
+ def SRDA : BinaryRS<"srda", 0x8E, null_frag, GR128>;
}
// Rotate left.
@@ -1351,8 +1386,12 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
defm : ZXB<z_ucmp, GR64, CLGFR>;
// Memory-to-memory comparison.
-let mayLoad = 1, Defs = [CC] in
+let mayLoad = 1, Defs = [CC] in {
defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+ def CLCL : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>;
+ def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>;
+ def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>;
+}
// String comparison.
let mayLoad = 1, Defs = [CC] in
@@ -1381,6 +1420,12 @@ let Defs = [CC] in {
def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>;
def TMH : InstAlias<"tmh\t$R, $I", (TMLH GR32:$R, imm32lh16:$I), 0>;
+// Compare logical characters under mask -- not (yet) used for codegen.
+let Defs = [CC] in {
+ defm CLM : CompareRSPair<"clm", 0xBD, 0xEB21, GR32, 0>;
+ def CLMH : CompareRSY<"clmh", 0xEB20, GRH32, 0>;
+}
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -1581,6 +1626,115 @@ let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
}
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
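+// TR only modifies its storage operand; TRT and TRTR additionally report
+// their results through R0, R1 and the condition code.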
+let mayLoad = 1, mayStore = 1 in
+ def TR : SideEffectBinarySSa<"tr", 0xDC>;
+
+let mayLoad = 1, Defs = [CC, R0L, R1D] in {
+ def TRT : SideEffectBinarySSa<"trt", 0xDD>;
+ def TRTR : SideEffectBinarySSa<"trtr", 0xD0>;
+}
+
+let mayLoad = 1, mayStore = 1, Uses = [R0L] in
+ def TRE : SideEffectBinaryMemMemRRE<"tre", 0xB2A5, GR128, GR64>;
+
+let mayLoad = 1, Uses = [R1D], Defs = [CC] in {
+ defm TRTE : BinaryMemRRFcOpt<"trte", 0xB9BF, GR128, GR64>;
+ defm TRTRE : BinaryMemRRFcOpt<"trtre", 0xB9BD, GR128, GR64>;
+}
+
+let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in {
+ defm TROO : SideEffectTernaryMemMemRRFcOpt<"troo", 0xB993, GR128, GR64>;
+ defm TROT : SideEffectTernaryMemMemRRFcOpt<"trot", 0xB992, GR128, GR64>;
+ defm TRTO : SideEffectTernaryMemMemRRFcOpt<"trto", 0xB991, GR128, GR64>;
+ defm TRTT : SideEffectTernaryMemMemRRFcOpt<"trtt", 0xB990, GR128, GR64>;
+}
+
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ defm CU12 : SideEffectTernaryMemMemRRFcOpt<"cu12", 0xB2A7, GR128, GR128>;
+ defm CU14 : SideEffectTernaryMemMemRRFcOpt<"cu14", 0xB9B0, GR128, GR128>;
+ defm CU21 : SideEffectTernaryMemMemRRFcOpt<"cu21", 0xB2A6, GR128, GR128>;
+ defm CU24 : SideEffectTernaryMemMemRRFcOpt<"cu24", 0xB9B1, GR128, GR128>;
+ def CU41 : SideEffectBinaryMemMemRRE<"cu41", 0xB9B2, GR128, GR128>;
+ def CU42 : SideEffectBinaryMemMemRRE<"cu42", 0xB9B3, GR128, GR128>;
+
+ let isAsmParserOnly = 1 in {
+ defm CUUTF : SideEffectTernaryMemMemRRFcOpt<"cuutf", 0xB2A6, GR128, GR128>;
+ defm CUTFU : SideEffectTernaryMemMemRRFcOpt<"cutfu", 0xB2A7, GR128, GR128>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
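+// These all take a function code in R0 and a parameter-block address in
+// R1, which is reflected in the fixed register uses below.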
+let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in {
+ def KM : SideEffectBinaryMemMemRRE<"km", 0xB92E, GR128, GR128>;
+ def KMC : SideEffectBinaryMemMemRRE<"kmc", 0xB92F, GR128, GR128>;
+
+ def KIMD : SideEffectBinaryMemRRE<"kimd", 0xB93E, GR64, GR128>;
+ def KLMD : SideEffectBinaryMemRRE<"klmd", 0xB93F, GR64, GR128>;
+ def KMAC : SideEffectBinaryMemRRE<"kmac", 0xB91E, GR64, GR128>;
+
+ let Predicates = [FeatureMessageSecurityAssist4] in {
+ def KMF : SideEffectBinaryMemMemRRE<"kmf", 0xB92A, GR128, GR128>;
+ def KMO : SideEffectBinaryMemMemRRE<"kmo", 0xB92B, GR128, GR128>;
+ def KMCTR : SideEffectTernaryMemMemMemRRFb<"kmctr", 0xB92D,
+ GR128, GR128, GR128>;
+ def PCC : SideEffectInherentRRE<"pcc", 0xB92C>;
+ }
+ let Predicates = [FeatureMessageSecurityAssist5] in
+ def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
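+// CVB/CVD convert between binary and packed decimal; the rest operate on
+// packed-decimal storage operands. None of these have codegen patterns
+// yet, hence null_frag and the side-effect classes.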
+defm CVB : BinaryRXPair<"cvb", 0x4F, 0xE306, null_frag, GR32, load, 4>;
+def CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, load, 8>;
+
+defm CVD : StoreRXPair<"cvd", 0x4E, 0xE326, null_frag, GR32, 4>;
+def CVDG : StoreRXY<"cvdg", 0xE32E, null_frag, GR64, 8>;
+
+let mayLoad = 1, mayStore = 1 in {
+ def MVN : SideEffectBinarySSa<"mvn", 0xD1>;
+ def MVZ : SideEffectBinarySSa<"mvz", 0xD3>;
+ def MVO : SideEffectBinarySSb<"mvo", 0xF1>;
+
+ def PACK : SideEffectBinarySSb<"pack", 0xF2>;
+ def PKA : SideEffectBinarySSf<"pka", 0xE9>;
+ def PKU : SideEffectBinarySSf<"pku", 0xE1>;
+ def UNPK : SideEffectBinarySSb<"unpk", 0xF3>;
+ let Defs = [CC] in {
+ def UNPKA : SideEffectBinarySSa<"unpka", 0xEA>;
+ def UNPKU : SideEffectBinarySSa<"unpku", 0xE2>;
+ }
+}
+
+let mayLoad = 1, mayStore = 1 in {
+ let Defs = [CC] in {
+ def AP : SideEffectBinarySSb<"ap", 0xFA>;
+ def SP : SideEffectBinarySSb<"sp", 0xFB>;
+ def ZAP : SideEffectBinarySSb<"zap", 0xF8>;
+ def SRP : SideEffectTernarySSc<"srp", 0xF0>;
+ }
+ def MP : SideEffectBinarySSb<"mp", 0xFC>;
+ def DP : SideEffectBinarySSb<"dp", 0xFD>;
+ let Defs = [CC] in {
+ def ED : SideEffectBinarySSa<"ed", 0xDE>;
+ def EDMK : SideEffectBinarySSa<"edmk", 0xDF>;
+ }
+}
+
+let Defs = [CC] in {
+ def CP : CompareSSb<"cp", 0xF9>;
+ def TP : TestRSL<"tp", 0xEBC0>;
+}
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -1712,12 +1866,39 @@ let usesCustomInserter = 1 in {
// Search a block of memory for a character.
let mayLoad = 1, Defs = [CC] in
- defm SRST : StringRRE<"srst", 0xb25e, z_search_string>;
+ defm SRST : StringRRE<"srst", 0xB25E, z_search_string>;
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+ def SRSTU : SideEffectBinaryMemMemRRE<"srstu", 0xB9BE, GR64, GR64>;
+
+// Compare until substring equal.
+let mayLoad = 1, Defs = [CC], Uses = [R0L, R1L] in
+ def CUSE : SideEffectBinaryMemMemRRE<"cuse", 0xB257, GR128, GR128>;
+
+// Compare and form codeword.
+let mayLoad = 1, Defs = [CC, R1D, R2D, R3D], Uses = [R1D, R2D, R3D] in
+ def CFC : SideEffectAddressS<"cfc", 0xB21A, null_frag>;
+
+// Update tree.
+let mayLoad = 1, mayStore = 1, Defs = [CC, R0D, R1D, R2D, R3D, R5D],
+ Uses = [R0D, R1D, R2D, R3D, R4D, R5D] in
+ def UPT : SideEffectInherentE<"upt", 0x0102>;
+
+// Checksum.
+let mayLoad = 1, Defs = [CC] in
+ def CKSM : SideEffectBinaryMemMemRRE<"cksm", 0xB241, GR64, GR128>;
+
+// Compression call.
+let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in
+ def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>;
// Supervisor call.
let hasSideEffects = 1, isCall = 1, Defs = [CC] in
def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>;
+// Monitor call.
+let hasSideEffects = 1, isCall = 1 in
+ def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>;
+
// Store clock.
let hasSideEffects = 1, Defs = [CC] in {
def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>;
@@ -1729,10 +1910,18 @@ let hasSideEffects = 1, Defs = [CC] in {
let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in
def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>;
+// Extract CPU attribute.
+let hasSideEffects = 1 in
+ def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>;
+
// Extract CPU time.
let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in
def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>;
+// Extract PSW.
+let hasSideEffects = 1, Uses = [CC] in
+ def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>;
+
// Execute.
let hasSideEffects = 1 in {
def EX : SideEffectBinaryRX<"ex", 0x44, GR64>;
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 7bb4fe5afb3f..713612129d90 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -531,6 +531,7 @@ def BDAddr64Disp12 : AddressAsmOperand<"BDAddr", "64", "12">;
def BDAddr64Disp20 : AddressAsmOperand<"BDAddr", "64", "20">;
def BDXAddr64Disp12 : AddressAsmOperand<"BDXAddr", "64", "12">;
def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">;
+def BDLAddr64Disp12Len4 : AddressAsmOperand<"BDLAddr", "64", "12", "Len4">;
def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr", "64", "12", "Len8">;
def BDRAddr64Disp12 : AddressAsmOperand<"BDRAddr", "64", "12">;
def BDVAddr64Disp12 : AddressAsmOperand<"BDVAddr", "64", "12">;
@@ -578,6 +579,7 @@ def bdxaddr20pair : BDXMode<"BDXAddr", "64", "20", "Pair">;
def dynalloc12only : BDXMode<"DynAlloc", "64", "12", "Only">;
def laaddr12pair : BDXMode<"LAAddr", "64", "12", "Pair">;
def laaddr20pair : BDXMode<"LAAddr", "64", "20", "Pair">;
+def bdladdr12onlylen4 : BDLMode<"BDLAddr", "64", "12", "Only", "4">;
def bdladdr12onlylen8 : BDLMode<"BDLAddr", "64", "12", "Only", "8">;
def bdraddr12only : BDRMode<"BDRAddr", "64", "12", "Only">;
def bdvaddr12only : BDVMode< "64", "12">;
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index fde26ed4e1c5..adfc69c5d4cf 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -10,7 +10,8 @@
//===----------------------------------------------------------------------===//
// Type profiles
//===----------------------------------------------------------------------===//
-def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>]>;
+def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>,
+ SDTCisVT<1, i64>]>;
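+// callseq_start now carries a second immediate, mirroring callseq_end;
+// see the matching ADJCALLSTACKDOWN change in SystemZInstrInfo.td.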
def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>,
SDTCisVT<1, i64>]>;
def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td
index dbba8ab42b5a..1ce0168f95e9 100644
--- a/lib/Target/SystemZ/SystemZSchedule.td
+++ b/lib/Target/SystemZ/SystemZSchedule.td
@@ -56,12 +56,16 @@ def LSU_lat1 : SchedWrite;
// Floating point unit (zEC12 and earlier)
def FPU : SchedWrite;
def FPU2 : SchedWrite;
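+// Decimal unit, used by the decimal-arithmetic instructions on zEC12 and
+// earlier; z13 models these on the vector unit via VecDFX below.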
+def DFU : SchedWrite;
+def DFU2 : SchedWrite;
// Vector sub units (z13)
def VecBF : SchedWrite;
def VecBF2 : SchedWrite;
def VecDF : SchedWrite;
def VecDF2 : SchedWrite;
+def VecDFX : SchedWrite;
+def VecDFX2 : SchedWrite;
def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit.
def VecMul : SchedWrite;
def VecStr : SchedWrite;
diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td
index 7aee6f52e9a7..612c3b6cf96e 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -76,6 +76,8 @@ def : WriteRes<VecBF, [Z13_VecUnit]> { let Latency = 8; }
def : WriteRes<VecBF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
def : WriteRes<VecDF, [Z13_VecUnit]> { let Latency = 8; }
def : WriteRes<VecDF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecDFX, [Z13_VecUnit]> { let Latency = 1; }
+def : WriteRes<VecDFX2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 2; }
def : WriteRes<VecFPd, [Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
@@ -179,6 +181,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -268,6 +271,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -277,6 +281,9 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[FXb, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of ceil(5/2) FXb ops)
def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
GroupAlone], (instregex "STM(G|H|Y)?$")>;
@@ -288,6 +295,7 @@ def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -345,7 +353,7 @@ def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
def : InstRW<[FXa], (instregex "ALR(K)?$")>;
def : InstRW<[FXa], (instregex "AR(K)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "A(G)?SI$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;
// Logical addition with carry
def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
@@ -438,11 +446,15 @@ def : InstRW<[FXa, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXa, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXa, Lat5], (instregex "MHI$")>;
def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXa, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXa, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
+def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>;
+def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>;
def : InstRW<[FXa, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
def : InstRW<[LSU, FXa, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
@@ -456,7 +468,8 @@ def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SLA(K)?$")>;
+def : InstRW<[FXa], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXa, FXa, FXa, FXa, Lat8], (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -505,7 +518,7 @@ def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -516,6 +529,9 @@ def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
def : InstRW<[FXb], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -563,6 +579,42 @@ def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>;
+def : InstRW<[FXb, VecDF, FXb, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>;
+
+def : InstRW<[FXb, VecDFX, LSU, LSU, Lat9, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXb, VecDFX2, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXb, FXb, VecDFX2, LSU, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>;
+def : InstRW<[VecDFX, LSU, Lat4, GroupAlone], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -640,13 +692,30 @@ def : InstRW<[FXa], (instregex "ZEXT128_(32|64)$")>;
// String instructions
def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
// Move with key
def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+// Monitor call
+def : InstRW<[FXb], (instregex "MC$")>;
+
+// Extract CPU attribute
+def : InstRW<[FXb, Lat30], (instregex "ECAG$")>;
+
// Extract CPU Time
def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>;
+// Extract PSW
+def : InstRW<[FXb, Lat30], (instregex "EPSW$")>;
+
// Execute
def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
@@ -811,14 +880,17 @@ def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[VecFPd, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecXsPm, LSU, Lat8], (instregex "C(E|D)B$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "C(E|D)BR?$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXBR$")>;
+def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>;
+def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>;
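+// KEB/KDB/KXBR (compare and signal) deliberately share the scheduling of
+// the plain compares.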
// Test Data Class
def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td
index a950e54e7601..670df8ff5541 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -59,6 +59,7 @@ def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
def Z196_FXUnit : ProcResource<2>;
def Z196_LSUnit : ProcResource<2>;
def Z196_FPUnit : ProcResource<1>;
+def Z196_DFUnit : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
def : WriteRes<FXU, [Z196_FXUnit]> { let Latency = 1; }
@@ -66,6 +67,8 @@ def : WriteRes<LSU, [Z196_LSUnit]> { let Latency = 4; }
def : WriteRes<LSU_lat1, [Z196_LSUnit]> { let Latency = 1; }
def : WriteRes<FPU, [Z196_FPUnit]> { let Latency = 8; }
def : WriteRes<FPU2, [Z196_FPUnit, Z196_FPUnit]> { let Latency = 9; }
+def : WriteRes<DFU, [Z196_DFUnit]> { let Latency = 2; }
+def : WriteRes<DFU2, [Z196_DFUnit, Z196_DFUnit]> { let Latency = 3; }
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -152,6 +155,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -226,6 +230,7 @@ def : InstRW<[LSU], (instregex "LLG(C|F|H|T|FRL|HRL)$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -235,6 +240,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of 3 ops)
def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
(instregex "STM(H|Y|G)?$")>;
@@ -246,6 +254,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -285,7 +294,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>;
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AH(Y)?$")>;
def : InstRW<[FXU], (instregex "AIH$")>;
def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
@@ -294,15 +303,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
def : InstRW<[FXU], (instregex "AGR(K)?$")>;
def : InstRW<[FXU], (instregex "AHI(K)?$")>;
def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
def : InstRW<[FXU], (instregex "ALGHSIK$")>;
def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
def : InstRW<[FXU], (instregex "ALR(K)?$")>;
def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
// Logical addition with carry
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
@@ -395,11 +403,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "D$")>;
def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
(instregex "DSG(F)?R$")>;
def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
@@ -416,7 +430,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU, Lat2], (instregex "SLA(K)?$")>;
+def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -465,7 +480,7 @@ def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -476,6 +491,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch
//===----------------------------------------------------------------------===//
@@ -520,6 +538,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>;
+def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>;
+
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
+def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -571,13 +625,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
// String instructions
def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
// Move with key
def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+// Monitor call
+def : InstRW<[FXU], (instregex "MC$")>;
+
+// Extract CPU attribute
+def : InstRW<[FXU, Lat30], (instregex "ECAG$")>;
+
// Extract CPU Time
def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+// Extract PSW
+def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
+
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
@@ -740,14 +811,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
-def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
// Test Data Class
def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td
index 8ab6c826f1ed..1bdb8779dc72 100644
--- a/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -59,6 +59,7 @@ def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
def ZEC12_FXUnit : ProcResource<2>;
def ZEC12_LSUnit : ProcResource<2>;
def ZEC12_FPUnit : ProcResource<1>;
+def ZEC12_DFUnit : ProcResource<1>;
def ZEC12_VBUnit : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
@@ -67,6 +68,8 @@ def : WriteRes<LSU, [ZEC12_LSUnit]> { let Latency = 4; }
def : WriteRes<LSU_lat1, [ZEC12_LSUnit]> { let Latency = 1; }
def : WriteRes<FPU, [ZEC12_FPUnit]> { let Latency = 8; }
def : WriteRes<FPU2, [ZEC12_FPUnit, ZEC12_FPUnit]> { let Latency = 9; }
+def : WriteRes<DFU, [ZEC12_DFUnit]> { let Latency = 2; }
+def : WriteRes<DFU2, [ZEC12_DFUnit, ZEC12_DFUnit]> { let Latency = 3; }
def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -155,6 +158,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -236,6 +240,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -245,6 +250,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of 3 ops)
def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
(instregex "STM(H|Y|G)?$")>;
@@ -256,6 +264,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -295,7 +304,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>;
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
def : InstRW<[FXU, LSU, Lat6], (instregex "AH(Y)?$")>;
def : InstRW<[FXU], (instregex "AIH$")>;
def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
@@ -304,15 +313,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
def : InstRW<[FXU], (instregex "AGR(K)?$")>;
def : InstRW<[FXU], (instregex "AHI(K)?$")>;
def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
def : InstRW<[FXU], (instregex "ALGHSIK$")>;
def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
def : InstRW<[FXU], (instregex "ALR(K)?$")>;
def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
// Logical addition with carry
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
@@ -405,11 +413,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "D$")>;
def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
(instregex "DSG(F)?R$")>;
def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
@@ -426,7 +440,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SLA(K)?$")>;
+def : InstRW<[FXU], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -475,7 +490,7 @@ def : InstRW<[FXU, Lat2], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -486,6 +501,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -532,6 +550,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>;
+def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>;
+
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
+def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -609,13 +663,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
// String instructions
def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
// Move with key
def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
+// Monitor call
+def : InstRW<[FXU], (instregex "MC$")>;
+
+// Extract CPU attribute
+def : InstRW<[FXU, Lat30], (instregex "ECAG$")>;
+
// Extract CPU Time
def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+// Extract PSW
+def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
+
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
@@ -778,14 +849,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
-def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
// Test Data Class
def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index ce07ea3318a5..022679a7bc18 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -37,12 +37,13 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
const TargetMachine &TM)
: SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
- HasPopulationCount(false), HasFastSerialization(false),
- HasInterlockedAccess1(false), HasMiscellaneousExtensions(false),
+ HasPopulationCount(false), HasMessageSecurityAssist4(false),
+ HasFastSerialization(false), HasInterlockedAccess1(false),
+ HasMiscellaneousExtensions(false),
HasExecutionHint(false), HasLoadAndTrap(false),
HasTransactionalExecution(false), HasProcessorAssist(false),
HasVector(false), HasLoadStoreOnCond2(false),
- HasLoadAndZeroRightmostByte(false),
+ HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false),
TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this), TSInfo(), FrameLowering() {}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index cdb61327a16a..770dd7cd939f 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -39,6 +39,7 @@ protected:
bool HasHighWord;
bool HasFPExtension;
bool HasPopulationCount;
+ bool HasMessageSecurityAssist4;
bool HasFastSerialization;
bool HasInterlockedAccess1;
bool HasMiscellaneousExtensions;
@@ -49,6 +50,7 @@ protected:
bool HasVector;
bool HasLoadStoreOnCond2;
bool HasLoadAndZeroRightmostByte;
+ bool HasMessageSecurityAssist5;
private:
Triple TargetTriple;
@@ -104,6 +106,10 @@ public:
// Return true if the target has the population-count facility.
bool hasPopulationCount() const { return HasPopulationCount; }
+ // Return true if the target has the message-security-assist
+ // extension facility 4.
+ bool hasMessageSecurityAssist4() const { return HasMessageSecurityAssist4; }
+
// Return true if the target has the fast-serialization facility.
bool hasFastSerialization() const { return HasFastSerialization; }
@@ -132,6 +138,10 @@ public:
return HasLoadAndZeroRightmostByte;
}
+ // Return true if the target has the message-security-assist
+ // extension facility 5.
+ bool hasMessageSecurityAssist5() const { return HasMessageSecurityAssist5; }
+
// Return true if the target has the vector facility.
bool hasVector() const { return HasVector; }
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 73d1d4be293b..6b45839c14b0 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -19,8 +19,8 @@ let Defs = [ARGUMENTS] in {
// Call sequence markers. These have an immediate which represents the amount of
// stack space to allocate or free, which is used for varargs lowering.
let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
-def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt),
- [(WebAssemblycallseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>;
def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
[(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
} // isCodeGenOnly = 1
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index a601b575f579..fa2146f7db84 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -25,7 +25,8 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">,
// WebAssembly-specific DAG Node Types.
//===----------------------------------------------------------------------===//
-def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
+ SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCallSeqEnd :
SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 784c3a6557ff..3a421fe77392 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -235,6 +235,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
+def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
+ "LEA instruction with 3 ops or certain registers is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat
@@ -480,6 +482,7 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
+ FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate
]>;
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index ebd179e786da..fc3b4836c178 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -180,44 +180,6 @@ private:
} // end anonymous namespace.
-static std::pair<X86::CondCode, bool>
-getX86ConditionCode(CmpInst::Predicate Predicate) {
- X86::CondCode CC = X86::COND_INVALID;
- bool NeedSwap = false;
- switch (Predicate) {
- default: break;
- // Floating-point Predicates
- case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
- case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
- case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
- case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
- case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
- case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
- case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
- case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
- case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
- case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
-
- // Integer Predicates
- case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
- case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
- case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
- case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
- case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
- case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
- case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
- case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
- case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
- case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
- }
-
- return std::make_pair(CC, NeedSwap);
-}
-
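+// The file-local getX86ConditionCode helper is gone; callers below now use
+// the shared X86::getX86ConditionCode instead.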
static std::pair<unsigned, bool>
getX86SSEConditionCode(CmpInst::Predicate Predicate) {
unsigned CC;
@@ -1559,7 +1521,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
X86::CondCode CC;
bool SwapArgs;
- std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
unsigned Opc = X86::getSETFromCond(CC);
@@ -1697,7 +1659,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
bool SwapArgs;
unsigned BranchOpc;
- std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
BranchOpc = X86::GetCondBranchFromCond(CC);
@@ -2070,7 +2032,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
}
bool NeedSwap;
- std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
+ std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
const Value *CmpLHS = CI->getOperand(0);
@@ -2319,7 +2281,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
const auto *CI = dyn_cast<CmpInst>(Cond);
if (CI && (CI->getParent() == I->getParent())) {
bool NeedSwap;
- std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
+ std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());
if (CC > X86::LAST_VALID_COND)
return false;
@@ -3293,7 +3255,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes).addImm(0);
+ .addImm(NumBytes).addImm(0).addImm(0);
// Walk the register/memloc assignments, inserting copies/loads.
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
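
The NeedSwap flag returned by the now-shared X86::getX86ConditionCode marks predicates that are lowered by commuting the compare operands, e.g. FCMP_OLT falls through to the FCMP_OGT condition after a swap. A standalone sanity check of that equivalence, using plain C++ doubles rather than LLVM types:

#include <cassert>
#include <limits>

// olt(a,b) and ogt(b,a) agree for all inputs, including NaNs (both are
// ordered compares and return false when either operand is NaN), which
// is why FCMP_OLT can set NeedSwap and reuse the FCMP_OGT lowering.
static bool olt(double A, double B) { return A < B; }
static bool ogt(double A, double B) { return A > B; }

int main() {
  const double NaN = std::numeric_limits<double>::quiet_NaN();
  for (double A : {-1.0, 0.0, 2.5, NaN})
    for (double B : {-1.0, 0.0, 2.5, NaN})
      assert(olt(A, B) == ogt(B, A));
}
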
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 2cd4c1a3e7b3..9f649dad8bc0 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -27,20 +27,26 @@
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
-#define DEBUG_TYPE "x86-fixup-LEAs"
+namespace llvm {
+void initializeFixupLEAPassPass(PassRegistry &);
+}
+
+#define FIXUPLEA_DESC "X86 LEA Fixup"
+#define FIXUPLEA_NAME "x86-fixup-LEAs"
+
+#define DEBUG_TYPE FIXUPLEA_NAME
STATISTIC(NumLEAs, "Number of LEA instructions created");
namespace {
class FixupLEAPass : public MachineFunctionPass {
enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- static char ID;
+
/// \brief Loop over all of the instructions in the basic block
/// replacing applicable instructions with LEA instructions,
/// where appropriate.
bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- StringRef getPassName() const override { return "X86 LEA Fixup"; }
/// \brief Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
@@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass {
void processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
+
+ /// \brief Given a LEA instruction which is unprofitable
+  /// on SNB+, try to replace it with other instructions.
+ /// According to Intel's Optimization Reference Manual:
+ /// " For LEA instructions with three source operands and some specific
+ /// situations, instruction latency has increased to 3 cycles, and must
+ /// dispatch via port 1:
+ /// - LEA that has all three source operands: base, index, and offset
+ /// - LEA that uses base and index registers where the base is EBP, RBP,
+ /// or R13
+ /// - LEA that uses RIP relative addressing mode
+ /// - LEA that uses 16-bit addressing mode "
+ /// This function currently handles the first 2 cases only.
+ MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
+ MachineFunction::iterator MFI);
+
/// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
/// and convert them to INC or DEC respectively.
bool fixupIncDec(MachineBasicBlock::iterator &I,
@@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass {
MachineBasicBlock::iterator &MBBI) const;
public:
- FixupLEAPass() : MachineFunctionPass(ID) {}
+ static char ID;
+
+ StringRef getPassName() const override { return FIXUPLEA_DESC; }
+
+ FixupLEAPass() : MachineFunctionPass(ID) {
+ initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
+ }
/// \brief Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
@@ -104,9 +132,12 @@ private:
bool OptIncDec;
bool OptLEA;
};
-char FixupLEAPass::ID = 0;
}
+char FixupLEAPass::ID = 0;
+
+INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
+
MachineInstr *
FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI) const {
@@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
- OptLEA = ST.LEAusesAG() || ST.slowLEA();
+ OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
if (!OptLEA && !OptIncDec)
return false;
@@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
return MachineBasicBlock::iterator();
}
-static inline bool isLEA(const int opcode) {
- return opcode == X86::LEA16r || opcode == X86::LEA32r ||
- opcode == X86::LEA64r || opcode == X86::LEA64_32r;
+static inline bool isLEA(const int Opcode) {
+ return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+ Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+static inline bool isInefficientLEAReg(unsigned int Reg) {
+ return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13;
+}
+
+static inline bool isRegOperand(const MachineOperand &Op) {
+ return Op.isReg() && Op.getReg() != X86::NoRegister;
+}
+
+/// hasInefficientLEABaseReg - LEA that uses base and index registers
+/// where the base is EBP, RBP, or R13.
+static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
+ const MachineOperand &Index) {
+ return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
+ isRegOperand(Index);
+}
+
+static inline bool hasLEAOffset(const MachineOperand &Offset) {
+ return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
+}
+
+// LEA instruction that has all three operands: offset, base and index
+static inline bool isThreeOperandsLEA(const MachineOperand &Base,
+ const MachineOperand &Index,
+ const MachineOperand &Offset) {
+ return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset);
+}
+
+static inline int getADDrrFromLEA(int LEAOpcode) {
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ return X86::ADD16rr;
+ case X86::LEA32r:
+ return X86::ADD32rr;
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ return X86::ADD64rr;
+ }
+}
+
+static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
+ bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri;
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
+ case X86::LEA64r:
+ return IsInt8 ? X86::ADD64ri8 : X86::ADD64ri32;
+ }
}
/// isLEASimpleIncOrDec - Does this LEA have one of these forms:
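
The ri8-versus-ri split in getADDriFromLEA above is purely an encoding-size optimization: offsets that fit a sign-extended 8-bit immediate get the shorter form. A standalone sketch of the 32-bit case, with string opcode names standing in for the LLVM enums:

#include <cassert>
#include <cstdint>
#include <cstring>

// isInt<8> in LLVM tests for a sign-extended 8-bit immediate.
static bool isInt8(int64_t V) { return V >= -128 && V <= 127; }

static const char *addOpcodeForLEA32(int64_t Offset) {
  return isInt8(Offset) ? "ADD32ri8" : "ADD32ri";
}

int main() {
  assert(std::strcmp(addOpcodeForLEA32(-100), "ADD32ri8") == 0);
  assert(std::strcmp(addOpcodeForLEA32(512), "ADD32ri") == 0);
}
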
@@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
MachineInstr &MI = *I;
- const int opcode = MI.getOpcode();
- if (!isLEA(opcode))
+ const int Opcode = MI.getOpcode();
+ if (!isLEA(Opcode))
return;
if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
!TII->isSafeToClobberEFLAGS(*MFI, I))
@@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
return;
if (MI.getOperand(2).getImm() > 1)
return;
- int addrr_opcode, addri_opcode;
- switch (opcode) {
- default:
- llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- addrr_opcode = X86::ADD16rr;
- addri_opcode = X86::ADD16ri;
- break;
- case X86::LEA32r:
- addrr_opcode = X86::ADD32rr;
- addri_opcode = X86::ADD32ri;
- break;
- case X86::LEA64_32r:
- case X86::LEA64r:
- addrr_opcode = X86::ADD64rr;
- addri_opcode = X86::ADD64ri32;
- break;
- }
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
DEBUG(dbgs() << "FixLEA: Replaced by: ";);
MachineInstr *NewMI = nullptr;
- const MachineOperand &Dst = MI.getOperand(0);
  // Make an ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
- const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
- .add(Dst)
- .add(Src1)
- .add(Src2);
- MFI->insert(I, NewMI);
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
+ const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1);
+ NewMI =
+ BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
DEBUG(NewMI->dump(););
}
  // Make an ADD instruction for the immediate
if (MI.getOperand(4).getImm() != 0) {
+ const MCInstrDesc &ADDri =
+ TII->get(getADDriFromLEA(Opcode, MI.getOperand(4)));
const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
- .add(Dst)
+ NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
.add(SrcR)
.addImm(MI.getOperand(4).getImm());
- MFI->insert(I, NewMI);
DEBUG(NewMI->dump(););
}
if (NewMI) {
MFI->erase(I);
- I = static_cast<MachineBasicBlock::iterator>(NewMI);
+ I = NewMI;
+ }
+}
+
+MachineInstr *
+FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
+ MachineFunction::iterator MFI) {
+
+ const int LEAOpcode = MI.getOpcode();
+ if (!isLEA(LEAOpcode))
+ return nullptr;
+
+ const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Base = MI.getOperand(1);
+ const MachineOperand &Scale = MI.getOperand(2);
+ const MachineOperand &Index = MI.getOperand(3);
+ const MachineOperand &Offset = MI.getOperand(4);
+ const MachineOperand &Segment = MI.getOperand(5);
+
+ if (!(isThreeOperandsLEA(Base, Index, Offset) ||
+ hasInefficientLEABaseReg(Base, Index)) ||
+ !TII->isSafeToClobberEFLAGS(*MFI, MI) ||
+ Segment.getReg() != X86::NoRegister)
+ return nullptr;
+
+ unsigned int DstR = Dst.getReg();
+ unsigned int BaseR = Base.getReg();
+ unsigned int IndexR = Index.getReg();
+ unsigned SSDstR =
+ (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR;
+ bool IsScale1 = Scale.getImm() == 1;
+ bool IsInefficientBase = isInefficientLEAReg(BaseR);
+ bool IsInefficientIndex = isInefficientLEAReg(IndexR);
+
+  // Skip these cases since they take more than 2 instructions
+  // to replace the LEA instruction.
+ if (IsInefficientBase && SSDstR == BaseR && !IsScale1)
+ return nullptr;
+ if (LEAOpcode == X86::LEA64_32r && IsInefficientBase &&
+ (IsInefficientIndex || !IsScale1))
+ return nullptr;
+
+ const DebugLoc DL = MI.getDebugLoc();
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
+ const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
+
+ DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
+ DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+
+ // First try to replace LEA with one or two (for the 3-op LEA case)
+ // add instructions:
+ // 1.lea (%base,%index,1), %base => add %index,%base
+ // 2.lea (%base,%index,1), %index => add %base,%index
+ if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
+ const MachineOperand &Src = DstR == BaseR ? Index : Base;
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+  // If the base is inefficient, try switching the index and base operands;
+  // otherwise just break the 3-Ops LEA into a 2-Ops LEA plus an ADD:
+  // lea offset(%base,%index,scale), %dst =>
+  // lea (%base,%index,scale), %dst; add offset, %dst
+ if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .add(IsInefficientBase ? Index : Base)
+ .add(Scale)
+ .add(IsInefficientBase ? Base : Index)
+ .addImm(0)
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+ // Handle the rest of the cases with inefficient base register:
+ assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+ assert(IsInefficientBase && "efficient base should be handled already!");
+
+ // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
+ if (IsScale1 && !hasLEAOffset(Offset)) {
+ TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill());
+ DEBUG(MI.getPrevNode()->dump(););
+
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
+ // lea offset(%base,%index,scale), %dst =>
+ // lea offset( ,%index,scale), %dst; add %base,%dst
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .addReg(0)
+ .add(Scale)
+ .add(Index)
+ .add(Offset)
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+
+ NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
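
Tying the helpers above together: the LEAs this new hook rewrites are those with base, index, and displacement all present, or with an EBP/RBP/R13 base next to an index register. A standalone sketch of that classification (the AddrMode struct and register strings are invented for illustration):

#include <cassert>
#include <string>

struct AddrMode { // invented stand-in for the LEA's address operands
  std::string Base, Index; // empty == no register
  int Scale = 1;
  long Disp = 0;
};

static bool isInefficientBase(const std::string &R) {
  return R == "EBP" || R == "RBP" || R == "R13";
}

static bool isSlow3OpsLEACandidate(const AddrMode &AM) {
  bool ThreeOps = !AM.Base.empty() && !AM.Index.empty() && AM.Disp != 0;
  bool BadBase = isInefficientBase(AM.Base) && !AM.Index.empty();
  return ThreeOps || BadBase; // the two manual cases the pass handles
}

int main() {
  assert(isSlow3OpsLEACandidate({"RAX", "RCX", 1, 8})); // 3 source operands
  assert(isSlow3OpsLEACandidate({"RBP", "RCX", 1, 0})); // bad base + index
  assert(!isSlow3OpsLEACandidate({"RAX", "", 1, 8}));   // 2-op LEA is fine
}
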
bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
@@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
if (OptLEA) {
if (MF.getSubtarget<X86Subtarget>().isSLM())
processInstructionForSLM(I, MFI);
- else
- processInstruction(I, MFI);
+
+ else {
+ if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
+ if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
+ MFI->erase(I);
+ I = NewMI;
+ }
+ } else
+ processInstruction(I, MFI);
+ }
}
}
return false;
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 12a10bf3072f..c899f0fd5100 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1178,8 +1178,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
break;
- if (ConstantSDNode
- *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
unsigned Val = CN->getZExtValue();
// Note that we handle x<<1 as (,x,2) rather than (x,x) here so
// that the base operand remains free for further matching. If
@@ -1187,15 +1186,14 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// in MatchAddress turns (,x,2) into (x,x), which is cheaper.
if (Val == 1 || Val == 2 || Val == 3) {
AM.Scale = 1 << Val;
- SDValue ShVal = N.getNode()->getOperand(0);
+ SDValue ShVal = N.getOperand(0);
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (CurDAG->isBaseWithConstantOffset(ShVal)) {
- AM.IndexReg = ShVal.getNode()->getOperand(0);
- ConstantSDNode *AddVal =
- cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+ AM.IndexReg = ShVal.getOperand(0);
+ ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
if (!foldOffsetIntoAddress(Disp, AM))
return false;
@@ -1245,28 +1243,27 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
if (AM.BaseType == X86ISelAddressMode::RegBase &&
AM.Base_Reg.getNode() == nullptr &&
AM.IndexReg.getNode() == nullptr) {
- if (ConstantSDNode
- *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
CN->getZExtValue() == 9) {
AM.Scale = unsigned(CN->getZExtValue())-1;
- SDValue MulVal = N.getNode()->getOperand(0);
+ SDValue MulVal = N.getOperand(0);
SDValue Reg;
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
- isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
- Reg = MulVal.getNode()->getOperand(0);
+ isa<ConstantSDNode>(MulVal.getOperand(1))) {
+ Reg = MulVal.getOperand(0);
ConstantSDNode *AddVal =
- cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+ cast<ConstantSDNode>(MulVal.getOperand(1));
uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
if (foldOffsetIntoAddress(Disp, AM))
- Reg = N.getNode()->getOperand(0);
+ Reg = N.getOperand(0);
} else {
- Reg = N.getNode()->getOperand(0);
+ Reg = N.getOperand(0);
}
AM.IndexReg = AM.Base_Reg = Reg;
@@ -1289,7 +1286,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Test if the LHS of the sub can be folded.
X86ISelAddressMode Backup = AM;
- if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+ if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
AM = Backup;
break;
}
@@ -1300,7 +1297,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
}
int Cost = 0;
- SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+ SDValue RHS = Handle.getValue().getOperand(1);
// If the RHS involves a register with multiple uses, this
// transformation incurs an extra mov, due to the neg instruction
// clobbering its operand.
@@ -1309,7 +1306,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
(RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
- RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+ RHS.getOperand(0).getValueType() == MVT::i32))
++Cost;
// If the base is a register with multiple uses, this
// transformation may save a mov.
@@ -2524,7 +2521,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
N0.getNode()->hasOneUse() &&
N0.getValueType() != MVT::i8 &&
X86::isZeroNode(N1)) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C) break;
// For example, convert "testl %eax, $8" to "testb %al, $8"
@@ -2532,7 +2529,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
(!(C->getZExtValue() & 0x80) ||
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// On x86-32, only the ABCD registers have 8-bit subregisters.
if (!Subtarget->is64Bit()) {
@@ -2568,7 +2565,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Shift the immediate right by 8 bits.
SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
dl, MVT::i8);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Put the value in an ABCD register.
const TargetRegisterClass *TRC;
@@ -2605,7 +2602,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
MVT::i16);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Extract the 16-bit subregister.
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
@@ -2628,7 +2625,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
MVT::i32);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Extract the 32-bit subregister.
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
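
The X86ISelDAGToDAG hunks above are a mechanical cleanup: SDValue::getOperand forwards to the underlying node, so N.getNode()->getOperand(i) and N.getOperand(i) are interchangeable. A toy standalone model of that handle/node split (not the real classes):

#include <cassert>
#include <vector>

struct Node;

struct Value {                 // toy SDValue: a handle to a node
  Node *N = nullptr;
  Node *getNode() const { return N; }
  Value getOperand(unsigned I) const; // forwards to the node's operands
};

struct Node {                  // toy SDNode
  std::vector<Value> Ops;
};

Value Value::getOperand(unsigned I) const { return N->Ops[I]; }

int main() {
  Node Leaf;
  Node Add;
  Add.Ops = {Value{&Leaf}, Value{&Leaf}};
  Value V{&Add};
  assert(V.getOperand(0).getNode() == V.getNode()->Ops[0].getNode());
}
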
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9ee2234595f9..11c08292518a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
@@ -79,6 +80,17 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
" of the loop header PC will be 0)."),
cl::Hidden);
+/// Call this when the user attempts to do something unsupported, like
+/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
+/// report_fatal_error, so calling code should attempt to recover without
+/// crashing.
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
+ const char *Msg) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ DAG.getContext()->diagnose(
+ DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
+}
+
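
The design point here is that the diagnostic goes through the context's handler and control returns to the caller, which then patches in a dummy location (FP0 in the hunks below) so lowering can continue. A minimal standalone sketch of that report-and-recover shape, with invented names:

#include <iostream>
#include <string>

struct ContextModel { // invented stand-in for LLVMContext::diagnose
  void diagnose(const std::string &Msg) {
    std::cerr << "error: " << Msg << '\n'; // record, but do not abort
  }
};

int main() {
  ContextModel Ctx;
  Ctx.diagnose("SSE register return with SSE disabled");
  // Unlike report_fatal_error, execution continues, so a single run can
  // surface every unsupported construct instead of stopping at the first.
  std::cout << "lowering continues with a placeholder location\n";
}
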
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
@@ -1381,7 +1393,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
@@ -1445,8 +1457,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
- setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
@@ -1479,7 +1489,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
@@ -2207,15 +2217,17 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
- report_fatal_error("SSE register return with SSE disabled");
+ (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (ValVT == MVT::f64 &&
+ (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
+ // Likewise we can't return F64 values with SSE1 only. gcc does so, but
+ // llvm-gcc has never done it right and no one has noticed, so this
+ // should be OK for now.
+ errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
- // Likewise we can't return F64 values with SSE1 only. gcc does so, but
- // llvm-gcc has never done it right and no one has noticed, so this
- // should be OK for now.
- if (ValVT == MVT::f64 &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
- report_fatal_error("SSE2 register return with SSE2 disabled");
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
@@ -2528,7 +2540,8 @@ SDValue X86TargetLowering::LowerCallResult(
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
- report_fatal_error("SSE register return with SSE disabled");
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
@@ -3415,8 +3428,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
if (!IsSibcall)
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
+ NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
@@ -6912,9 +6925,9 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
if (IsSplat)
- return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
- DAG.getConstant(1, dl, VT),
- DAG.getConstant(0, dl, VT));
+ return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
+ DAG.getConstant(1, dl, VT),
+ DAG.getConstant(0, dl, VT));
// insert elements one by one
SDValue DstVec;
@@ -8386,9 +8399,9 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
- return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
- DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
- ZeroVector);
+ return DAG.getSelect(DL, VT, VMask,
+ DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+ ZeroVector);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
@@ -8748,8 +8761,9 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
- VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
- DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
+ VT,
+ DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
+ V1, V2));
}
case MVT::v16f32:
case MVT::v8f64:
@@ -13817,6 +13831,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
return SDValue();
+  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
+ // with patterns on the mask registers on AVX-512.
+ if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
+ return Op;
+
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
@@ -13826,10 +13845,30 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget.hasSSE41())
return SDValue();
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
+ // into an i1 condition so that we can use the mask-based 512-bit blend
+ // instructions.
+ if (VT.getSizeInBits() == 512) {
+ SDValue Cond = Op.getOperand(0);
+ // The vNi1 condition case should be handled above as it can be trivially
+ // lowered.
+ assert(Cond.getValueType().getScalarSizeInBits() ==
+ VT.getScalarSizeInBits() &&
+ "Should have a size-matched integer condition!");
+ // Build a mask by testing the condition against itself (tests for zero).
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
+ // Now return a new VSELECT using the mask.
+ return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
+ }
+
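
The trick in the 512-bit branch above: TESTM performs an AND and compares against zero per lane, so testing the condition against itself yields an i1 mask that is set exactly for the nonzero lanes. A scalar standalone model of one lane:

#include <cassert>
#include <cstdint>

// One lane of TESTM: true iff (A & B) != 0; with A == B this reduces to
// "lane is nonzero", which is the i1 the masked blend consumes.
static bool testmLane(uint32_t A, uint32_t B) { return (A & B) != 0; }

int main() {
  assert(testmLane(0xFFFFFFFFu, 0xFFFFFFFFu)); // true lane selects op1
  assert(!testmLane(0u, 0u));                  // false lane selects op2
}
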
// Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return
// a null value.
- switch (Op.getSimpleValueType().SimpleTy) {
+ switch (VT.SimpleTy) {
default:
// Most of the vector types have blends past SSE4.1.
return Op;
@@ -14725,7 +14764,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// location.
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
@@ -15348,8 +15387,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
- SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
- Zero, Four);
+ SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
@@ -15621,7 +15659,7 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
SDValue Zero =
DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
- SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+ SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
if (VT == ExtVT)
return SelectedVal;
return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
@@ -16713,7 +16751,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
if (BitWidth > AndBitWidth) {
KnownBits Known;
DAG.computeKnownBits(Op0, Known);
- if (Known.Zero.countLeadingOnes() < BitWidth - AndBitWidth)
+ if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
return SDValue();
}
LHS = Op1;
@@ -17455,7 +17493,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
- SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+ SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
VSel, DAG.getIntPtrConstant(0, DL));
@@ -17483,9 +17521,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
Op2Scalar = Op2.getOperand(0);
if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
- Op1Scalar.getValueType(),
- Cond, Op1Scalar, Op2Scalar);
+ SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
+ Op1Scalar, Op2Scalar);
if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, newSelect);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
@@ -17500,8 +17537,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
- Cond, Op1, Op2);
+ SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
}
@@ -17770,7 +17806,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
} else {
SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
- V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+ V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
if (ExtVT == VT)
return V;
}
@@ -18572,7 +18608,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
@@ -19021,8 +19057,10 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (isAllOnesConstant(Mask))
- return Op;
+
+ if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
+ if (MaskConst->getZExtValue() & 0x1)
+ return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
@@ -19081,7 +19119,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
// registration, or the .set_setframe offset.
MCSymbol *OffsetSym =
MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
- GlobalValue::getRealLinkageName(Fn->getName()));
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
@@ -19683,12 +19721,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
- case CONVERT_MASK_TO_VEC: {
- SDValue Mask = Op.getOperand(1);
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
- }
case BRCST_SUBVEC_TO_VEC: {
SDValue Src = Op.getOperand(1);
SDValue Passthru = Op.getOperand(2);
@@ -19932,7 +19964,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Op1 = Op.getOperand(1);
auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
- GlobalValue::getRealLinkageName(Fn->getName()));
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
// Generate a simple absolute symbol reference. This intrinsic is only
// supported on 32-bit Windows, which isn't PIC.
@@ -21741,6 +21773,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue Ex = DAG.getBitcast(ExVT, R);
+ // ashr(R, 63) === cmp_slt(R, 0)
+ if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
+ assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
+ "Unsupported PCMPGT op");
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT,
+ getZeroVector(VT, Subtarget, DAG, dl), R);
+ }
+
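
The identity used above holds because an arithmetic shift right by 63 broadcasts the sign bit across the lane, while PCMPGT(zero, R) produces all-ones exactly when R is negative. A standalone check (signed right shift is arithmetic on mainstream compilers and guaranteed to be since C++20):

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t X : {INT64_MIN, -5LL, -1LL, 0LL, 7LL, INT64_MAX}) {
    int64_t Ashr = X >> 63;            // 0 or -1 (all-ones)
    int64_t Pcmpgt = (0 > X) ? -1 : 0; // one PCMPGT lane: 0 > X
    assert(Ashr == Pcmpgt);
  }
}
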
if (ShiftAmt >= 32) {
// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
@@ -21839,10 +21879,19 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
(VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
(Subtarget.hasAVX512() && VT == MVT::v8i64))) {
+  // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
+ unsigned SubVectorScale = 1;
+ if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ SubVectorScale =
+ Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
+ Amt = Amt.getOperand(0);
+ }
+
// Peek through any splat that was introduced for i64 shift vectorization.
int SplatIndex = -1;
if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
@@ -21859,7 +21908,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
Amt = Amt.getOperand(0);
unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
- VT.getVectorNumElements();
+ (SubVectorScale * VT.getVectorNumElements());
unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
uint64_t ShiftAmt = 0;
unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
@@ -22233,23 +22282,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
- return DAG.getBitcast(SelVT,
- DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- return DAG.getBitcast(SelVT,
- DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
- return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
+ return DAG.getSelect(dl, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
@@ -22371,15 +22418,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
- return DAG.getBitcast(
- VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
+ return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
// its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue C =
DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
- return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
+ return DAG.getSelect(dl, VT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
@@ -23296,9 +23342,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
- Type *RetTy = isF64
- ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
- : (Type*)VectorType::get(ArgTy, 4);
+ Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
+ : (Type *)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
@@ -25779,7 +25824,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
// Emit CALLSEQ_START right before the instruction.
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
MachineInstrBuilder CallseqStart =
- BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
+ BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
// Emit CALLSEQ_END right after the instruction.
@@ -26517,7 +26562,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
-
+
case TargetOpcode::PATCHABLE_EVENT_CALL:
// Do nothing here, handle in xray instrumentation pass.
return BB;
@@ -29532,7 +29577,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
DAG.getAllOnesConstant(DL, CondVT));
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+ return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
// To use the condition operand as a bitwise mask, it must have elements that
@@ -30015,7 +30060,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
- return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
}
@@ -31561,20 +31606,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- assert(N->getOpcode() == ISD::OR);
+ assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
- if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
+ if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
- assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
- // Canonicalize pandn to RHS
- if (N0.getOpcode() == X86ISD::ANDNP)
+ // Canonicalize AND to LHS.
+ if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
+  // TODO: Attempt to match against AND(XOR(-1,X),Y) as well; waiting for
+  // the ANDNP combine allows other combines to happen that prevent matching.
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return SDValue();
@@ -31596,20 +31643,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
Y = peekThroughBitcasts(Y);
EVT MaskVT = Mask.getValueType();
-
- // Validate that the Mask operand is a vector sra node.
- // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
- // there is no psrai.b
unsigned EltBits = MaskVT.getScalarSizeInBits();
- unsigned SraAmt = ~0;
- if (Mask.getOpcode() == ISD::SRA) {
- if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
- if (auto *AmtConst = AmtBV->getConstantSplatNode())
- SraAmt = AmtConst->getZExtValue();
- } else if (Mask.getOpcode() == X86ISD::VSRAI)
- SraAmt = Mask.getConstantOperandVal(1);
- if ((SraAmt + 1) != EltBits)
+ // TODO: Attempt to handle floating point cases as well?
+ if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
return SDValue();
SDLoc DL(N);
@@ -31630,7 +31667,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
// (add (xor X, M), (and M, 1))
// And further to:
// (sub (xor X, M), M)
- if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
+ DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
auto IsNegV = [](SDNode *N, SDValue V) {
return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
@@ -31642,9 +31680,6 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
V = Y;
if (V) {
- if (EltBits != 8 && EltBits != 16 && EltBits != 32)
- return SDValue();
-
SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
SDValue SubOp2 = Mask;
@@ -31661,8 +31696,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
if (V == Y)
std::swap(SubOp1, SubOp2);
- return DAG.getBitcast(VT,
- DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
}
}
@@ -31675,7 +31710,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
X = DAG.getBitcast(BlendVT, X);
Y = DAG.getBitcast(BlendVT, Y);
Mask = DAG.getBitcast(BlendVT, Mask);
- Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+ Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
return DAG.getBitcast(VT, Mask);
}
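
The new guard above, ComputeNumSignBits(Mask) == EltBits, generalizes the old explicit sra-by-EltBits-1 match: when every bit of a lane replicates the sign bit, the lane is all-zeros or all-ones, which is all a PBLENDV-style select needs. A standalone model for 32-bit lanes:

#include <cassert>
#include <cstdint>

// Count how many leading bits (including the sign bit) replicate the
// sign bit -- a scalar stand-in for SelectionDAG::ComputeNumSignBits.
static unsigned numSignBits(int32_t V) {
  uint32_t U = static_cast<uint32_t>(V);
  uint32_t Sign = (U >> 31) & 1;
  unsigned N = 1;
  while (N < 32 && ((U >> (31 - N)) & 1) == Sign)
    ++N;
  return N;
}

int main() {
  assert(numSignBits(0) == 32 && numSignBits(-1) == 32);       // valid masks
  assert(numSignBits(1) == 31 && numSignBits(INT32_MIN) == 1); // not masks
}
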
@@ -33655,8 +33690,7 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
- auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
- return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
+ return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
@@ -33949,7 +33983,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (InVT == MVT::i1) {
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
- return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
+ return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
}
return SDValue();
}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 3dc673e3c35a..d003d027ddb9 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -43,7 +43,8 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
+ (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
"#ADJCALLSTACKDOWN",
[]>,
Requires<[NotLP64]>;
@@ -52,8 +53,8 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[NotLP64]>;
}
-def : Pat<(X86callseq_start timm:$amt1),
- (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+ (ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;
// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
@@ -62,7 +63,8 @@ def : Pat<(X86callseq_start timm:$amt1),
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
+ (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
"#ADJCALLSTACKDOWN",
[]>,
Requires<[IsLP64]>;
@@ -71,8 +73,8 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[IsLP64]>;
}
-def : Pat<(X86callseq_start timm:$amt1),
- (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+ (ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;
// x86-64 va_start lowering magic.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 888daa275265..092ceb207ada 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -5729,6 +5729,44 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) {
}
}
+std::pair<X86::CondCode, bool>
+X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
+ X86::CondCode CC = X86::COND_INVALID;
+ bool NeedSwap = false;
+ switch (Predicate) {
+ default: break;
+ // Floating-point Predicates
+ case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
+ case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
+ case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
+ case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
+ case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
+ case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
+ case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+ // Integer Predicates
+ case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
+ case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
+ case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
+ case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
+ case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
+ case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
+ case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
+ case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
+
/// Return a set opcode for the given condition and
/// whether it has a memory operand.
unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
@@ -7589,6 +7627,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+ case X86::AVX1_SETALLONES: {
+ unsigned Reg = MIB->getOperand(0).getReg();
+ // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
+ MIB->setDesc(get(X86::VCMPPSYrri));
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
+ return true;
+ }
case X86::AVX512_512_SETALLONES: {
unsigned Reg = MIB->getOperand(0).getReg();
MIB->setDesc(get(X86::VPTERNLOGDZrri));
@@ -8477,6 +8522,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Alignment = 64;
break;
case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_256_SET0:
Alignment = 32;
@@ -8522,6 +8568,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
@@ -8563,13 +8610,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
- Opc == X86::AVX512_256_SET0)
+ Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
- Opc == X86::AVX512_512_SETALLONES);
+ Opc == X86::AVX512_512_SETALLONES ||
+ Opc == X86::AVX1_SETALLONES);
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 38567831b3a4..e64876073ccf 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -64,6 +64,10 @@ enum CondCode {
// Turn condition code into conditional branch opcode.
unsigned GetCondBranchFromCond(CondCode CC);
+/// \brief Return a pair of the condition code for the given predicate and
+/// whether the instruction operands should be swapped to match the condition
+/// code.
+std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
+
/// \brief Return a set opcode for the given condition and whether it has
/// a memory operand.
unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
@@ -186,6 +190,8 @@ public:
/// setup..destroy sequence (e.g. by pushes, or inside the callee).
int64_t getFrameAdjustment(const MachineInstr &I) const {
assert(isFrameInstr(I));
+ if (isFrameSetup(I))
+ return I.getOperand(2).getImm();
return I.getOperand(1).getImm();
}
@@ -193,7 +199,10 @@ public:
/// instruction.
void setFrameAdjustment(MachineInstr &I, int64_t V) const {
assert(isFrameInstr(I));
- I.getOperand(1).setImm(V);
+ if (isFrameSetup(I))
+ I.getOperand(2).setImm(V);
+ else
+ I.getOperand(1).setImm(V);
}
/// getSPAdjust - This returns the stack pointer adjustment made by
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 902b0c2c04e3..4d7d8ece92d9 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -84,7 +84,8 @@ def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
-def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
@@ -2351,6 +2352,38 @@ let Predicates = [HasBMI2] in {
def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
(BZHI64rm addr:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ // x & (-1 >> (32 - y))
+ def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
+ (BZHI32rr GR32:$src, GR32:$lz)>;
+ def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
+ (BZHI32rm addr:$src, GR32:$lz)>;
+
+ // x & (-1 >> (64 - y))
+ def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+ def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+
+ // x << (32 - y) >> (32 - y)
+ def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))),
+ (i8 (trunc (sub 32, GR32:$lz)))),
+ (BZHI32rr GR32:$src, GR32:$lz)>;
+ def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))),
+ (i8 (trunc (sub 32, GR32:$lz)))),
+ (BZHI32rm addr:$src, GR32:$lz)>;
+
+ // x << (64 - y) >> (64 - y)
+ def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))),
+ (i8 (trunc (sub 64, GR32:$lz)))),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+ def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))),
+ (i8 (trunc (sub 64, GR32:$lz)))),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
} // HasBMI2
let Predicates = [HasBMI] in {
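
The new BZHI patterns above encode two equivalent ways C code spells "clear the bits from position n upward": a mask built as -1 >> (32 - n), and a shift-up/shift-down pair. A standalone check that both forms match BZHI's semantics over the 0 < n < 32 range the patterns cover:

#include <cassert>
#include <cstdint>

static uint32_t bzhi32(uint32_t X, unsigned N) { // reference semantics
  return N >= 32 ? X : X & ((1u << N) - 1);
}

int main() {
  const uint32_t X = 0xDEADBEEFu;
  for (unsigned N = 1; N < 32; ++N) {
    assert((X & (~0u >> (32 - N))) == bzhi32(X, N));       // mask form
    assert(((X << (32 - N)) >> (32 - N)) == bzhi32(X, N)); // shift form
  }
}
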
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 48da2fa607af..f73d85e7e01b 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -486,6 +486,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>;
+ let Predicates = [HasAVX1Only, OptForMinSize] in {
+  def AVX1_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+ }
let Predicates = [HasAVX2] in
def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllOnesV))]>;
@@ -7755,14 +7759,12 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
[]>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
-
-// Without AVX2 we need to concat two v4i32 V_SETALLONES to create a 256-bit
-// all ones value.
-let Predicates = [HasAVX1Only] in
-def : Pat<(v8i32 immAllOnesV),
- (VINSERTF128rr
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), (V_SETALLONES), sub_xmm),
- (V_SETALLONES), 1)>;
+// To create a 256-bit all-ones value, we produce VCMPTRUEPS
+// with a YMM register containing zero.
+// FIXME: Avoid producing vxorps to clear the fake inputs.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
+}
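
Predicate 0xf in the pattern above is _CMP_TRUE_UQ, the always-true AVX compare predicate, so the compare returns all-ones lanes regardless of its zeroed inputs. A scalar standalone model of one lane (only the predicate the pattern uses is modeled):

#include <cassert>
#include <cstdint>

static uint32_t cmppsLane(float A, float B, unsigned Pred) {
  (void)A; (void)B; // inputs are irrelevant for the "true" predicate
  return Pred == 0xf ? 0xFFFFFFFFu : 0u; // 0xf = _CMP_TRUE_UQ: always true
}

int main() { assert(cmppsLane(0.0f, 0.0f, 0xf) == 0xFFFFFFFFu); }
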
multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
PatFrag memop_frag> {
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index d65eb1de8d09..de58d719acb4 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -56,13 +56,9 @@ private:
bool selectImpl(MachineInstr &I) const;
  // TODO: remove once supported by TableGen-erated instruction selection.
- unsigned getFAddOp(LLT &Ty, const RegisterBank &RB) const;
- unsigned getFSubOp(LLT &Ty, const RegisterBank &RB) const;
unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc,
uint64_t Alignment) const;
- bool selectBinaryOp(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF) const;
bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -71,6 +67,10 @@ private:
MachineFunction &MF) const;
bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
+ bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
const X86TargetMachine &TM;
const X86Subtarget &STI;
@@ -226,13 +226,11 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
"Generic instruction has unexpected implicit operands\n");
if (selectImpl(I))
- return true;
+ return true;
DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
// TODO: This should be implemented by tblgen.
- if (selectBinaryOp(I, MRI, MF))
- return true;
if (selectLoadStoreOp(I, MRI, MF))
return true;
if (selectFrameIndexOrGep(I, MRI, MF))
@@ -241,109 +239,14 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
return true;
if (selectTrunc(I, MRI, MF))
return true;
+ if (selectZext(I, MRI, MF))
+ return true;
+ if (selectCmp(I, MRI, MF))
+ return true;
return false;
}
-unsigned X86InstructionSelector::getFAddOp(LLT &Ty,
- const RegisterBank &RB) const {
-
- if (X86::VECRRegBankID != RB.getID())
- return TargetOpcode::G_FADD;
-
- if (Ty == LLT::scalar(32)) {
- if (STI.hasAVX512()) {
- return X86::VADDSSZrr;
- } else if (STI.hasAVX()) {
- return X86::VADDSSrr;
- } else if (STI.hasSSE1()) {
- return X86::ADDSSrr;
- }
- } else if (Ty == LLT::scalar(64)) {
- if (STI.hasAVX512()) {
- return X86::VADDSDZrr;
- } else if (STI.hasAVX()) {
- return X86::VADDSDrr;
- } else if (STI.hasSSE2()) {
- return X86::ADDSDrr;
- }
- } else if (Ty == LLT::vector(4, 32)) {
- if ((STI.hasAVX512()) && (STI.hasVLX())) {
- return X86::VADDPSZ128rr;
- } else if (STI.hasAVX()) {
- return X86::VADDPSrr;
- } else if (STI.hasSSE1()) {
- return X86::ADDPSrr;
- }
- }
-
- return TargetOpcode::G_FADD;
-}
-
-unsigned X86InstructionSelector::getFSubOp(LLT &Ty,
- const RegisterBank &RB) const {
-
- if (X86::VECRRegBankID != RB.getID())
- return TargetOpcode::G_FSUB;
-
- if (Ty == LLT::scalar(32)) {
- if (STI.hasAVX512()) {
- return X86::VSUBSSZrr;
- } else if (STI.hasAVX()) {
- return X86::VSUBSSrr;
- } else if (STI.hasSSE1()) {
- return X86::SUBSSrr;
- }
- } else if (Ty == LLT::scalar(64)) {
- if (STI.hasAVX512()) {
- return X86::VSUBSDZrr;
- } else if (STI.hasAVX()) {
- return X86::VSUBSDrr;
- } else if (STI.hasSSE2()) {
- return X86::SUBSDrr;
- }
- } else if (Ty == LLT::vector(4, 32)) {
- if ((STI.hasAVX512()) && (STI.hasVLX())) {
- return X86::VSUBPSZ128rr;
- } else if (STI.hasAVX()) {
- return X86::VSUBPSrr;
- } else if (STI.hasSSE1()) {
- return X86::SUBPSrr;
- }
- }
-
- return TargetOpcode::G_FSUB;
-}
-
-bool X86InstructionSelector::selectBinaryOp(MachineInstr &I,
- MachineRegisterInfo &MRI,
- MachineFunction &MF) const {
-
- const unsigned DefReg = I.getOperand(0).getReg();
- LLT Ty = MRI.getType(DefReg);
- const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
-
- unsigned NewOpc = I.getOpcode();
-
- switch (NewOpc) {
- case TargetOpcode::G_FADD:
- NewOpc = getFAddOp(Ty, RB);
- break;
- case TargetOpcode::G_FSUB:
- NewOpc = getFSubOp(Ty, RB);
- break;
- default:
- break;
- }
-
- if (NewOpc == I.getOpcode())
- return false;
-
- I.setDesc(TII.get(NewOpc));
-
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
-}
-
unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
unsigned Opc,
uint64_t Alignment) const {
@@ -562,6 +465,105 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
return true;
}
+bool X86InstructionSelector::selectZext(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_ZEXT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ if (SrcTy == LLT::scalar(1)) {
+
+ unsigned AndOpc;
+ if (DstTy == LLT::scalar(32))
+ AndOpc = X86::AND32ri8;
+ else if (DstTy == LLT::scalar(64))
+ AndOpc = X86::AND64ri8;
+ else
+ return false;
+
+ const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+ unsigned DefReg =
+ MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank));
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG), DefReg)
+ .addImm(0)
+ .addReg(SrcReg)
+ .addImm(X86::sub_8bit);
+
+ MachineInstr &AndInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AndOpc), DstReg)
+ .addReg(DefReg)
+ .addImm(1);
+
+ constrainSelectedInstRegOperands(AndInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+bool X86InstructionSelector::selectCmp(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_ICMP)
+ return false;
+
+ X86::CondCode CC;
+ bool SwapArgs;
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(
+ (CmpInst::Predicate)I.getOperand(1).getPredicate());
+ unsigned OpSet = X86::getSETFromCond(CC);
+
+ unsigned LHS = I.getOperand(2).getReg();
+ unsigned RHS = I.getOperand(3).getReg();
+
+ if (SwapArgs)
+ std::swap(LHS, RHS);
+
+ unsigned OpCmp;
+ LLT Ty = MRI.getType(LHS);
+
+ switch (Ty.getSizeInBits()) {
+ default:
+ return false;
+ case 8:
+ OpCmp = X86::CMP8rr;
+ break;
+ case 16:
+ OpCmp = X86::CMP16rr;
+ break;
+ case 32:
+ OpCmp = X86::CMP32rr;
+ break;
+ case 64:
+ OpCmp = X86::CMP64rr;
+ break;
+ }
+
+ MachineInstr &CmpInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpCmp))
+ .addReg(LHS)
+ .addReg(RHS);
+
+ MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(OpSet), I.getOperand(0).getReg());
+
+ constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+}
+
InstructionSelector *
llvm::createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &Subtarget,
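What the two new selectors emit, reduced to scalar semantics (a behavioral sketch only, not the selector API): G_ZEXT from s1 widens the 8-bit source and masks everything but bit 0, and G_ICMP becomes a flags-setting compare followed by a SETcc that materializes the predicate as a 0/1 byte.

    #include <cstdint>
    #include <cassert>

    // SUBREG_TO_REG + AND32ri8/AND64ri8: widen, then keep only bit 0.
    static uint32_t zextS1(uint8_t SrcByte) {
      return static_cast<uint32_t>(SrcByte) & 1u;
    }

    // CMPrr sets EFLAGS; SETcc turns the chosen condition into 0 or 1.
    static uint8_t icmpULT(uint32_t LHS, uint32_t RHS) {
      return LHS < RHS ? 1 : 0; // cmp; setb
    }

    int main() {
      assert(zextS1(0xFF) == 1); // stale high bits are masked away
      assert(icmpULT(1, 2) == 1);
      assert(icmpULT(2, 1) == 0);
      return 0;
    }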
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 2a40399ba571..bc73bb1ae8c5 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
- FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
+ FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
};
struct IntrinsicData {
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 4f5e70414aa9..cf26238c0239 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -87,10 +87,16 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_ZEXT, s32}, Legal);
setAction({G_SEXT, s32}, Legal);
- for (auto Ty : {s8, s16}) {
+ for (auto Ty : {s1, s8, s16}) {
setAction({G_ZEXT, 1, Ty}, Legal);
setAction({G_SEXT, 1, Ty}, Legal);
}
+
+ // Comparison
+ setAction({G_ICMP, s1}, Legal);
+
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({G_ICMP, 1, Ty}, Legal);
}
void X86LegalizerInfo::setLegalizerInfo64bit() {
@@ -139,10 +145,16 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
setAction({G_SEXT, Ty}, Legal);
}
- for (auto Ty : {s8, s16, s32}) {
+ for (auto Ty : {s1, s8, s16, s32}) {
setAction({G_ZEXT, 1, Ty}, Legal);
setAction({G_SEXT, 1, Ty}, Legal);
}
+
+ // Comparison
+ setAction({G_ICMP, s1}, Legal);
+
+ for (auto Ty : {s8, s16, s32, s64, p0})
+ setAction({G_ICMP, 1, Ty}, Legal);
}
void X86LegalizerInfo::setLegalizerInfoSSE1() {
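The setAction calls above populate a per-opcode legality table keyed by type. A hypothetical miniature keyed on (opcode, bit width) only -- the opcode id and the collapsing of LLVM's (opcode, type index, type) key are placeholders, not LLVM's API:

    #include <cstdio>
    #include <map>
    #include <utility>

    enum Action { Legal, NarrowScalar, Unsupported };

    int main() {
      std::map<std::pair<int, unsigned>, Action> Actions;
      const int G_ICMP = 1;                     // hypothetical opcode id
      Actions[{G_ICMP, 1u}] = Legal;            // the s1 result type
      for (unsigned Bits : {8u, 16u, 32u, 64u}) // operand types, 64-bit mode
        Actions[{G_ICMP, Bits}] = Legal;
      printf("s64 icmp legal? %d\n", Actions[{G_ICMP, 64u}] == Legal);
      return 0;
    }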
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index cf2ceef8013a..7e4cba1c8345 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -320,14 +320,14 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
case CallingConv::X86_RegCall:
if (Is64Bit) {
if (IsWin64) {
- return (HasSSE ? CSR_Win64_RegCall_SaveList :
+ return (HasSSE ? CSR_Win64_RegCall_SaveList :
CSR_Win64_RegCall_NoSSE_SaveList);
} else {
- return (HasSSE ? CSR_SysV64_RegCall_SaveList :
+ return (HasSSE ? CSR_SysV64_RegCall_SaveList :
CSR_SysV64_RegCall_NoSSE_SaveList);
}
} else {
- return (HasSSE ? CSR_32_RegCall_SaveList :
+ return (HasSSE ? CSR_32_RegCall_SaveList :
CSR_32_RegCall_NoSSE_SaveList);
}
case CallingConv::Cold:
@@ -435,15 +435,15 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_64_HHVM_RegMask;
case CallingConv::X86_RegCall:
if (Is64Bit) {
- if (IsWin64) {
- return (HasSSE ? CSR_Win64_RegCall_RegMask :
+ if (IsWin64) {
+ return (HasSSE ? CSR_Win64_RegCall_RegMask :
CSR_Win64_RegCall_NoSSE_RegMask);
} else {
- return (HasSSE ? CSR_SysV64_RegCall_RegMask :
+ return (HasSSE ? CSR_SysV64_RegCall_RegMask :
CSR_SysV64_RegCall_NoSSE_RegMask);
}
} else {
- return (HasSSE ? CSR_32_RegCall_RegMask :
+ return (HasSSE ? CSR_32_RegCall_RegMask :
CSR_32_RegCall_NoSSE_RegMask);
}
case CallingConv::Cold:
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index de1514243aeb..02be95e2e556 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -253,6 +253,11 @@ protected:
/// True if the LEA instruction with certain arguments is slow
bool SlowLEA;
+ /// True if the LEA instruction has all three source operands: base, index,
+ /// and offset, or if the LEA instruction uses base and index registers where
+ /// the base is EBP, RBP, or R13
+ bool Slow3OpsLEA;
+
/// True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec;
@@ -490,6 +495,7 @@ public:
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
+ bool slow3OpsLEA() const { return Slow3OpsLEA; }
bool slowIncDec() const { return SlowIncDec; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
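The two LEA shapes the new Slow3OpsLEA flag describes, written out as the address arithmetic they encode (the assembly in the comments is illustrative; how FixupLEA actually splits them is outside this sketch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t Base = 0x1000, Index = 8;
      // lea 4(%rdi,%rsi,2), %rax -- base + index*scale + disp, the
      // "all three source operands" form.
      uint64_t ThreeOps = Base + Index * 2 + 4;
      // lea (%rbp,%rsi), %rax -- a base of RBP/EBP/R13 cannot be encoded
      // without a displacement byte, the second slow shape.
      uint64_t RbpBase = Base + Index;
      printf("%llu %llu\n", (unsigned long long)ThreeOps,
             (unsigned long long)RbpBase);
      return 0;
    }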
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 086f55dd60b5..c6a90725d89c 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -61,6 +61,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
namespace llvm {
void initializeWinEHStatePassPass(PassRegistry &);
+void initializeFixupLEAPassPass(PassRegistry &);
void initializeX86ExecutionDepsFixPass(PassRegistry &);
} // end namespace llvm
@@ -75,6 +76,7 @@ extern "C" void LLVMInitializeX86Target() {
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
+ initializeFixupLEAPassPass(PR);
initializeX86ExecutionDepsFixPass(PR);
}
@@ -87,7 +89,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSFreeBSD())
return llvm::make_unique<X86FreeBSDTargetObjectFile>();
- if (TT.isOSLinux() || TT.isOSNaCl())
+ if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
if (TT.isOSFuchsia())
return llvm::make_unique<X86FuchsiaTargetObjectFile>();
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index f3b619a2956a..80e18161a94b 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -247,35 +247,38 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry SSE2UniformConstCostTable[] = {
- { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
- { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
- { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
-
- { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand).
- { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand).
- { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb).
-
- { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
- { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
- { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
- { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
- { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence
- { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
- { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence
- { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
+ { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
+ { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
+
+ { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasSSE2()) {
// pmuldq sequence.
if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
- return LT.first * 30;
+ return LT.first * 32;
if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 15;
- if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
- LT.second))
- return LT.first * Entry->Cost;
+ // XOP has faster vXi8 shifts.
+ if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
+ !ST->hasXOP())
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
}
static const CostTblEntry AVX2UniformCostTable[] = {
@@ -430,18 +433,18 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRL, MVT::v2i64, 2 },
{ ISD::SRA, MVT::v2i64, 2 },
// 256bit shifts require splitting if AVX2 didn't catch them above.
- { ISD::SHL, MVT::v32i8, 2 },
- { ISD::SRL, MVT::v32i8, 4 },
- { ISD::SRA, MVT::v32i8, 4 },
- { ISD::SHL, MVT::v16i16, 2 },
- { ISD::SRL, MVT::v16i16, 4 },
- { ISD::SRA, MVT::v16i16, 4 },
- { ISD::SHL, MVT::v8i32, 2 },
- { ISD::SRL, MVT::v8i32, 4 },
- { ISD::SRA, MVT::v8i32, 4 },
- { ISD::SHL, MVT::v4i64, 2 },
- { ISD::SRL, MVT::v4i64, 4 },
- { ISD::SRA, MVT::v4i64, 4 },
+ { ISD::SHL, MVT::v32i8, 2+2 },
+ { ISD::SRL, MVT::v32i8, 4+2 },
+ { ISD::SRA, MVT::v32i8, 4+2 },
+ { ISD::SHL, MVT::v16i16, 2+2 },
+ { ISD::SRL, MVT::v16i16, 4+2 },
+ { ISD::SRA, MVT::v16i16, 4+2 },
+ { ISD::SHL, MVT::v8i32, 2+2 },
+ { ISD::SRL, MVT::v8i32, 4+2 },
+ { ISD::SRA, MVT::v8i32, 4+2 },
+ { ISD::SHL, MVT::v4i64, 2+2 },
+ { ISD::SRL, MVT::v4i64, 4+2 },
+ { ISD::SRA, MVT::v4i64, 4+2 },
};
// Look for XOP lowering tricks.
@@ -451,23 +454,28 @@ int X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry SSE2UniformShiftCostTable[] = {
// Uniform splats are cheaper for the following instructions.
- { ISD::SHL, MVT::v16i16, 2 }, // psllw.
- { ISD::SHL, MVT::v8i32, 2 }, // pslld
- { ISD::SHL, MVT::v4i64, 2 }, // psllq.
-
- { ISD::SRL, MVT::v16i16, 2 }, // psrlw.
- { ISD::SRL, MVT::v8i32, 2 }, // psrld.
- { ISD::SRL, MVT::v4i64, 2 }, // psrlq.
-
- { ISD::SRA, MVT::v16i16, 2 }, // psraw.
- { ISD::SRA, MVT::v8i32, 2 }, // psrad.
- { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
- { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
+ { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
+ { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
+ { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
+
+ { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
+ { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
+ { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
+
+ { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
+ { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
+ { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
+ { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
};
if (ST->hasSSE2() &&
((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
(Op2Info == TargetTransformInfo::OK_UniformValue))) {
+
+ // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
+ if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
+ return LT.first * 4; // 2*psrad + shuffle.
+
if (const auto *Entry =
CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
@@ -581,28 +589,28 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry SSE41CostTable[] = {
- { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
- { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld
-
- { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend.
-
- { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend.
-
- { ISD::MUL, MVT::v4i32, 1 } // pmulld
+ { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
+ { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
+
+ { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
+
+ { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
+
+ { ISD::MUL, MVT::v4i32, 1 } // pmulld
};
if (ST->hasSSE41())
@@ -612,33 +620,33 @@ int X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
- { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
- { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
-
- { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
-
- { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
- { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
-
- { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v8i16, 1 }, // pmullw
- { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
- { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
-
- { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
+ { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
+
+ { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v8i16, 1 }, // pmullw
+ { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
+ { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
// It is not a good idea to vectorize division. We have to scalarize it and
// in the process we will often end up having to spill regular
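The tables above feed a lookup of roughly the shape below: find the per-type entry, then scale by LT.first (how many legal-type pieces the vector splits into). The '+2' terms bake the extract/insert overhead of splitting a 256-bit op into two 128-bit halves directly into the entry. A sketch with placeholder enum values, not LLVM's ISD/MVT constants:

    #include <cstdio>

    struct CostTblEntry { int ISD; int VT; int Cost; };

    static const CostTblEntry Table[] = {
        {/*ISD::SHL*/ 1, /*v32i8*/ 2, 4 + 2}, // 2*(psllw + pand) + split
    };

    static int lookupCost(int ISD, int VT, int LTFirst) {
      for (const CostTblEntry &E : Table)
        if (E.ISD == ISD && E.VT == VT)
          return LTFirst * E.Cost; // scale by number of legal-type pieces
      return -1;                   // no entry: caller falls through
    }

    int main() { printf("%d\n", lookupCost(1, 2, 1)); } // prints 6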
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 500b26b3be17..3ee14a0ff7b1 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -398,7 +398,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
/*isVarArg=*/false);
Function *Trampoline =
Function::Create(TrampolineTy, GlobalValue::InternalLinkage,
- Twine("__ehhandler$") + GlobalValue::getRealLinkageName(
+ Twine("__ehhandler$") + GlobalValue::dropLLVMManglingEscape(
ParentFunc->getName()),
TheModule);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index b8742683a0c8..1da189c5cd31 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -409,7 +409,7 @@ static bool isWordAligned(SDValue Value, SelectionDAG &DAG)
{
KnownBits Known;
DAG.computeKnownBits(Value, Known);
- return Known.Zero.countTrailingOnes() >= 2;
+ return Known.countMinTrailingZeros() >= 2;
}
SDValue XCoreTargetLowering::
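The rewrite replaces a hand-rolled "count trailing ones of Known.Zero" with the KnownBits helper that means exactly that: how many low bits are provably zero. Two such bits prove 4-byte alignment. In miniature, with a stand-in struct rather than the llvm::KnownBits API:

    #include <cstdint>
    #include <cassert>

    struct MiniKnownBits {
      uint64_t Zero = 0; // bit set => that value bit is known to be 0
      unsigned countMinTrailingZeros() const {
        unsigned N = 0;
        for (uint64_t M = Zero; M & 1; M >>= 1)
          ++N;
        return N;
      }
    };

    int main() {
      MiniKnownBits Known;
      Known.Zero = 0b111; // low three bits known zero
      assert(Known.countMinTrailingZeros() >= 2); // word aligned
      return 0;
    }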
@@ -1131,8 +1131,7 @@ SDValue XCoreTargetLowering::LowerCCCCallTo(
unsigned NumBytes = RetCCInfo.getNextStackOffset();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index f1d52d5a191f..b87ba6548962 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -73,9 +73,10 @@ def XCoreLdwsp : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp,
[SDNPHasChain, SDNPMayLoad]>;
// These are target-independent nodes, but have target-specific formats.
-def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_XCoreCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
- SDTCisVT<1, i32> ]>;
+ SDTCisVT<1, i32> ]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
@@ -323,9 +324,9 @@ class F2R_np<bits<6> opc, string OpcStr> :
//===----------------------------------------------------------------------===//
let Defs = [SP], Uses = [SP] in {
-def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt),
- "# ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ "# ADJCALLSTACKDOWN $amt, $amt2",
+ [(callseq_start timm:$amt, timm:$amt2)]>;
def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"# ADJCALLSTACKUP $amt1",
[(callseq_end timm:$amt1, timm:$amt2)]>;
diff --git a/lib/ToolDrivers/CMakeLists.txt b/lib/ToolDrivers/CMakeLists.txt
new file mode 100644
index 000000000000..ad458450fda3
--- /dev/null
+++ b/lib/ToolDrivers/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(llvm-lib)
diff --git a/lib/ToolDrivers/LLVMBuild.txt b/lib/ToolDrivers/LLVMBuild.txt
new file mode 100644
index 000000000000..7da9a5c01005
--- /dev/null
+++ b/lib/ToolDrivers/LLVMBuild.txt
@@ -0,0 +1,24 @@
+;===- ./lib/ToolDrivers/LLVMBuild.txt --------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[common]
+subdirectories = llvm-lib
+
+[component_0]
+type = Group
+name = ToolDrivers
+parent = Libraries
diff --git a/lib/LibDriver/CMakeLists.txt b/lib/ToolDrivers/llvm-lib/CMakeLists.txt
index ab53a6843446..ab53a6843446 100644
--- a/lib/LibDriver/CMakeLists.txt
+++ b/lib/ToolDrivers/llvm-lib/CMakeLists.txt
diff --git a/lib/LibDriver/LLVMBuild.txt b/lib/ToolDrivers/llvm-lib/LLVMBuild.txt
index 799dc997c0bb..799dc997c0bb 100644
--- a/lib/LibDriver/LLVMBuild.txt
+++ b/lib/ToolDrivers/llvm-lib/LLVMBuild.txt
diff --git a/lib/LibDriver/LibDriver.cpp b/lib/ToolDrivers/llvm-lib/LibDriver.cpp
index c50629d71501..3bae3826d62e 100644
--- a/lib/LibDriver/LibDriver.cpp
+++ b/lib/ToolDrivers/llvm-lib/LibDriver.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/LibDriver/LibDriver.h"
+#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Option/Arg.h"
diff --git a/lib/LibDriver/Options.td b/lib/ToolDrivers/llvm-lib/Options.td
index 5a56ef7468d4..5a56ef7468d4 100644
--- a/lib/LibDriver/Options.td
+++ b/lib/ToolDrivers/llvm-lib/Options.td
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index 19e6789dfa74..4480220f2cd4 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -177,7 +177,7 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
// consume. Note that crossing coro.save also requires a spill, as any code
// between coro.save and coro.suspend may resume the coroutine and all of the
// state needs to be saved by that time.
- auto markSuspendBlock = [&](IntrinsicInst* BarrierInst) {
+ auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
BasicBlock *SuspendBlock = BarrierInst->getParent();
auto &B = getBlockData(SuspendBlock);
B.Suspend = true;
@@ -495,6 +495,78 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
return FramePtr;
}
+// Sets the unwind edge of an instruction to a particular successor.
+static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) {
+ if (auto *II = dyn_cast<InvokeInst>(TI))
+ II->setUnwindDest(Succ);
+ else if (auto *CS = dyn_cast<CatchSwitchInst>(TI))
+ CS->setUnwindDest(Succ);
+ else if (auto *CR = dyn_cast<CleanupReturnInst>(TI))
+ CR->setUnwindDest(Succ);
+ else
+ llvm_unreachable("unexpected terminator instruction");
+}
+
+// Replaces all uses of OldPred with NewPred in the PHI nodes of DestBB.
+static void updatePhiNodes(BasicBlock *DestBB, BasicBlock *OldPred,
+ BasicBlock *NewPred,
+ PHINode *LandingPadReplacement) {
+ unsigned BBIdx = 0;
+ for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // We manually update the LandingPadReplacement PHINode, and it is the last
+ // PHI node, so if we find it, we are done.
+ if (LandingPadReplacement == PN)
+ break;
+
+ // Reuse the previous value of BBIdx if it lines up. In cases where we
+ // have multiple phi nodes with *lots* of predecessors, this is a speed
+ // win because we don't have to scan the PHI looking for TIBB. This
+ // happens because the BB lists of PHI nodes are usually in the same
+ // order.
+ if (PN->getIncomingBlock(BBIdx) != OldPred)
+ BBIdx = PN->getBasicBlockIndex(OldPred);
+
+ assert(BBIdx != (unsigned)-1 && "Invalid PHI Index!");
+ PN->setIncomingBlock(BBIdx, NewPred);
+ }
+}
+
+// Uses SplitEdge unless the successor block is an EHPad, in which case do
+// EH-specific handling.
+static BasicBlock *ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ,
+ LandingPadInst *OriginalPad,
+ PHINode *LandingPadReplacement) {
+ auto *PadInst = Succ->getFirstNonPHI();
+ if (!LandingPadReplacement && !PadInst->isEHPad())
+ return SplitEdge(BB, Succ);
+
+ auto *NewBB = BasicBlock::Create(BB->getContext(), "", BB->getParent(), Succ);
+ setUnwindEdgeTo(BB->getTerminator(), NewBB);
+ updatePhiNodes(Succ, BB, NewBB, LandingPadReplacement);
+
+ if (LandingPadReplacement) {
+ auto *NewLP = OriginalPad->clone();
+ auto *Terminator = BranchInst::Create(Succ, NewBB);
+ NewLP->insertBefore(Terminator);
+ LandingPadReplacement->addIncoming(NewLP, NewBB);
+ return NewBB;
+ }
+ Value *ParentPad = nullptr;
+ if (auto *FuncletPad = dyn_cast<FuncletPadInst>(PadInst))
+ ParentPad = FuncletPad->getParentPad();
+ else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(PadInst))
+ ParentPad = CatchSwitch->getParentPad();
+ else
+ llvm_unreachable("handling for other EHPads not implemented yet");
+
+ auto *NewCleanupPad = CleanupPadInst::Create(ParentPad, {}, "", NewBB);
+ CleanupReturnInst::Create(NewCleanupPad, Succ, NewBB);
+ return NewBB;
+}
+
static void rewritePHIs(BasicBlock &BB) {
// For every incoming edge we will create a block holding all
// incoming values in a single PHI nodes.
@@ -502,7 +574,7 @@ static void rewritePHIs(BasicBlock &BB) {
// loop:
// %n.val = phi i32[%n, %entry], [%inc, %loop]
//
- // It will create:
+ // It will create:
//
// loop.from.entry:
// %n.loop.pre = phi i32 [%n, %entry]
@@ -517,9 +589,22 @@ static void rewritePHIs(BasicBlock &BB) {
// TODO: Simplify PHINodes in the basic block to remove duplicate
// predecessors.
+ LandingPadInst *LandingPad = nullptr;
+ PHINode *ReplPHI = nullptr;
+ if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
+ // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
+ // We replace the original landing pad with a PHINode that will collect the
+ // results from all of them.
+ ReplPHI = PHINode::Create(LandingPad->getType(), 1, "", LandingPad);
+ ReplPHI->takeName(LandingPad);
+ LandingPad->replaceAllUsesWith(ReplPHI);
+ // We will erase the original landing pad at the end of this function after
+ // ehAwareSplitEdge cloned it in the transition blocks.
+ }
+
SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
for (BasicBlock *Pred : Preds) {
- auto *IncomingBB = SplitEdge(Pred, &BB);
+ auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
auto *PN = cast<PHINode>(&BB.front());
do {
@@ -531,7 +616,14 @@ static void rewritePHIs(BasicBlock &BB) {
InputV->addIncoming(V, Pred);
PN->setIncomingValue(Index, InputV);
PN = dyn_cast<PHINode>(PN->getNextNode());
- } while (PN);
+ } while (PN != ReplPHI); // ReplPHI is either null or the PHI that replaced
+ // the landing pad.
+ }
+
+ if (LandingPad) {
+ // Calls to the ehAwareSplitEdge function cloned the original landing pad;
+ // we no longer need it.
+ LandingPad->eraseFromParent();
}
}
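The BBIdx reuse in updatePhiNodes above is a scan-avoidance trick: consecutive PHIs usually list their predecessors in the same order, so the index that matched OldPred last time is tried before falling back to a search. A self-contained sketch with integers standing in for blocks (it assumes OldPred occurs in every PHI, as the assert in the real code does):

    #include <cassert>
    #include <vector>

    struct MiniPHI { std::vector<int> IncomingBlocks; };

    static void retargetPreds(std::vector<MiniPHI> &Phis, int OldPred,
                              int NewPred) {
      size_t BBIdx = 0;
      for (MiniPHI &PN : Phis) {
        if (BBIdx >= PN.IncomingBlocks.size() ||
            PN.IncomingBlocks[BBIdx] != OldPred) {
          BBIdx = 0; // cache miss: scan for OldPred
          while (PN.IncomingBlocks[BBIdx] != OldPred)
            ++BBIdx;
        }
        PN.IncomingBlocks[BBIdx] = NewPred; // cache hit path skips the scan
      }
    }

    int main() {
      std::vector<MiniPHI> Phis{{{1, 2, 3}}, {{1, 2, 3}}, {{2, 1, 3}}};
      retargetPreds(Phis, 2, 9);
      assert(Phis[0].IncomingBlocks[1] == 9);
      assert(Phis[1].IncomingBlocks[1] == 9);
      assert(Phis[2].IncomingBlocks[0] == 9);
      return 0;
    }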
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 7ed07d63c627..231487923fad 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -610,8 +610,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
return true;
// Lookup the linkage recorded in the summaries during global analysis.
- const auto &GS = DefinedGlobals.find(GV.getGUID());
- GlobalValue::LinkageTypes Linkage;
+ auto GS = DefinedGlobals.find(GV.getGUID());
if (GS == DefinedGlobals.end()) {
// Must have been promoted (possibly conservatively). Find original
// name so that we can access the correct summary and see if it can
@@ -623,7 +622,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
std::string OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage,
TheModule.getSourceFileName());
- const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
if (GS == DefinedGlobals.end()) {
// Also check the original non-promoted non-globalized name. In some
// cases a preempted weak value is linked in as a local copy because
@@ -631,15 +630,11 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// In that case, since it was originally not a local value, it was
// recorded in the index using the original name.
// FIXME: This may not be needed once PR27866 is fixed.
- const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
assert(GS != DefinedGlobals.end());
- Linkage = GS->second->linkage();
- } else {
- Linkage = GS->second->linkage();
}
- } else
- Linkage = GS->second->linkage();
- return !GlobalValue::isLocalLinkage(Linkage);
+ }
+ return !GlobalValue::isLocalLinkage(GS->second->linkage());
};
// FIXME: See if we can just internalize directly here via linkage changes
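The FunctionImport change swaps three shadowed `const auto &GS` bindings for one mutable iterator that each fallback lookup reassigns, so the linkage is read once at the end from whichever lookup hit. The same pattern over a std::map:

    #include <cassert>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, int> DefinedGlobals{{"orig.name", 42}};

      auto GS = DefinedGlobals.find("promoted.name.123");
      if (GS == DefinedGlobals.end())
        GS = DefinedGlobals.find("orig.name"); // fallback reassigns, no shadow
      assert(GS != DefinedGlobals.end());
      assert(GS->second == 42); // single read at the end
      return 0;
    }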
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 6c83c99ae3be..673d3af0ab52 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -502,7 +502,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
InlinedArrayAllocasTy InlinedArrayAllocas;
- InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache);
+ InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI);
// Now that we have all of the call sites, loop over them and inline them if
// it looks profitable to do so.
@@ -872,7 +872,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// Setup the data structure used to plumb customization into the
// `InlineFunction` routine.
InlineFunctionInfo IFI(
- /*cg=*/nullptr, &GetAssumptionCache,
+ /*cg=*/nullptr, &GetAssumptionCache, PSI,
&FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
&FAM.getResult<BlockFrequencyAnalysis>(Callee));
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 2db47b3b5622..8dff2fb3be8a 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -42,6 +43,11 @@ STATISTIC(NumPartialInlined,
static cl::opt<bool>
DisablePartialInlining("disable-partial-inlining", cl::init(false),
cl::Hidden, cl::desc("Disable partial inlining"));
+// This option is used for testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+ cl::init(false), cl::ZeroOrMore,
+ cl::ReallyHidden,
+ cl::desc("Skip Cost Analysis"));
static cl::opt<unsigned> MaxNumInlineBlocks(
"max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -53,6 +59,15 @@ static cl::opt<int> MaxNumPartialInlining(
"max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
cl::desc("Max number of partial inlining. The default is unlimited"));
+// Used only when PGO or user-annotated branch data is absent. It is
+// the minimum value used to weight the outline region. If BFI
+// produces a larger value, the BFI value will be used.
+static cl::opt<int>
+ OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Relative frequency of outline region to "
+ "the entry block"));
+
namespace {
struct FunctionOutliningInfo {
@@ -84,8 +99,6 @@ struct PartialInlinerImpl {
bool run(Module &M);
Function *unswitchFunction(Function *F);
- std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
private:
int NumPartialInlining = 0;
std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -93,11 +106,84 @@ private:
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
ProfileSummaryInfo *PSI;
- bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE);
+ // Return the frequency of the OutliningBB relative to F's entry point.
+ // The result is no larger than 1 and is represented as a BranchProbability.
+ // (Note that the outlined region's 'head' block can only have incoming
+ // edges from the guarding entry blocks).
+ BranchProbability getOutliningCallBBRelativeFreq(Function *F,
+ FunctionOutliningInfo *OI,
+ Function *DuplicateFunction,
+ BlockFrequencyInfo *BFI,
+ BasicBlock *OutliningCallBB);
+
+ // Return true if partially inlining the callee of CS is expected to be
+ // profitable.
+ bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
+ BlockFrequencyInfo *CalleeBFI,
+ BasicBlock *OutliningCallBB,
+ int OutliningCallOverhead,
+ OptimizationRemarkEmitter &ORE);
+
+ // Try to inline DuplicateFunction (cloned from F with a call to
+ // the OutlinedFunction) into its callers. Return true
+ // if there is any successful inlining.
+ bool tryPartialInline(Function *DuplicateFunction,
+ Function *F, /* original function */
+ FunctionOutliningInfo *OI, Function *OutlinedFunction,
+ BlockFrequencyInfo *CalleeBFI);
+
+ // Compute the mapping from use sites of DuplicateFunction to the enclosing
+ // BB's profile count.
+ void computeCallsiteToProfCountMap(Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &SiteCountMap);
+
bool IsLimitReached() {
return (MaxNumPartialInlining != -1 &&
NumPartialInlining >= MaxNumPartialInlining);
}
+
+ CallSite getCallSite(User *U) {
+ CallSite CS;
+ if (CallInst *CI = dyn_cast<CallInst>(U))
+ CS = CallSite(CI);
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+ CS = CallSite(II);
+ else
+ llvm_unreachable("All uses must be calls");
+ return CS;
+ }
+
+ CallSite getOneCallSiteTo(Function *F) {
+ User *User = *F->user_begin();
+ return getCallSite(User);
+ }
+
+ std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+ CallSite CS = getOneCallSiteTo(F);
+ DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+ BasicBlock *Block = CS.getParent();
+ return std::make_tuple(DLoc, Block);
+ }
+
+ // Returns the costs associated with function outlining:
+ // - The first value is the non-weighted runtime cost for making the call
+ // to the outlined function 'OutlinedFunction', including the additional
+ // setup cost in the outlined function itself;
+ // - The second value is the estimated size of the new call sequence in
+ // basic block 'OutliningCallBB';
+ // - The third value is the estimated size of the original code from
+ // function 'F' that is extracted into the outlined function.
+ std::tuple<int, int, int>
+ computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
+ Function *OutlinedFunction,
+ BasicBlock *OutliningCallBB);
+ // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+ // approximate both the size and runtime cost (note that the current
+ // inline cost analysis does not clearly distinguish the two either).
+ int computeBBInlineCost(BasicBlock *BB);
+
+ std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+
};
struct PartialInlinerLegacyPass : public ModulePass {
@@ -157,7 +243,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
return isa<ReturnInst>(TI);
};
- auto GetReturnBlock = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+ auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
if (IsReturnBlock(Succ1))
return std::make_tuple(Succ1, Succ2);
if (IsReturnBlock(Succ2))
@@ -167,7 +253,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
};
// Detect a triangular shape:
- auto GetCommonSucc = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+ auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
if (IsSuccessor(Succ1, Succ2))
return std::make_tuple(Succ1, Succ2);
if (IsSuccessor(Succ2, Succ1))
@@ -223,7 +309,8 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
// Do sanity check of the entries: there should not
// be any successors (not in the entry set) other than
// {ReturnBlock, NonReturnBlock}
- assert(OutliningInfo->Entries[0] == &F->front());
+ assert(OutliningInfo->Entries[0] == &F->front() &&
+ "Function Entry must be the first in Entries vector");
DenseSet<BasicBlock *> Entries;
for (BasicBlock *E : OutliningInfo->Entries)
Entries.insert(E);
@@ -289,10 +376,54 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
return OutliningInfo;
}
-bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
- OptimizationRemarkEmitter &ORE) {
- // TODO : more sharing with shouldInline in Inliner.cpp
+// Check if there is PGO data or user-annotated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+ if (F->getEntryCount())
+ return true;
+ // Now check if any of the entry block has MD_prof data:
+ for (auto *E : OI->Entries) {
+ BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+ if (!BR || BR->isUnconditional())
+ continue;
+ uint64_t T, F;
+ if (BR->extractProfMetadata(T, F))
+ return true;
+ }
+ return false;
+}
+
+BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
+ Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
+ BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {
+
+  auto EntryFreq = BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
+ auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);
+
+ auto OutlineRegionRelFreq =
+ BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
+ EntryFreq.getFrequency());
+
+ if (hasProfileData(F, OI))
+ return OutlineRegionRelFreq;
+
+ // When profile data is not available, we need to be very
+ // conservative in estimating the overall savings. We need to make sure
+ // the outline region relative frequency is not below the threshold
+ // specified by the option.
+  OutlineRegionRelFreq = std::max(
+      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+
+ return OutlineRegionRelFreq;
+}
+
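The clamp at the end of getOutliningCallBBRelativeFreq, modeled with plain ratios instead of BranchProbability (a behavioral sketch; the 75 default mirrors OutlineRegionFreqPercent above):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static double relativeFreq(uint64_t CallBBFreq, uint64_t EntryFreq,
                               bool HasProfile, int MinPercent = 75) {
      double Rel = double(CallBBFreq) / double(EntryFreq);
      if (!HasProfile) // no PGO: keep the savings estimate conservative
        Rel = std::max(Rel, MinPercent / 100.0);
      return Rel;
    }

    int main() {
      assert(relativeFreq(10, 100, /*HasProfile=*/true) < 0.2);
      assert(relativeFreq(10, 100, /*HasProfile=*/false) >= 0.75);
      return 0;
    }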
+bool PartialInlinerImpl::shouldPartialInline(
+ CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
+ BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
+ int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
using namespace ore;
+ if (SkipCostAnalysis)
+ return true;
+
Instruction *Call = CS.getInstruction();
Function *Callee = CS.getCalledFunction();
Function *Caller = CS.getCaller();
@@ -302,36 +433,170 @@ bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
if (IC.isAlways()) {
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
- << NV("Callee", Callee)
+ << NV("Callee", F)
<< " should always be fully inlined, not partially");
return false;
}
if (IC.isNever()) {
ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
- << NV("Callee", Callee) << " not partially inlined into "
+ << NV("Callee", F) << " not partially inlined into "
<< NV("Caller", Caller)
<< " because it should never be inlined (cost=never)");
return false;
}
if (!IC) {
- ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
- << NV("Callee", Callee) << " not partially inlined into "
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", F) << " not partially inlined into "
<< NV("Caller", Caller) << " because too costly to inline (cost="
<< NV("Cost", IC.getCost()) << ", threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return false;
}
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+ // The savings of eliminating the call:
+ int NonWeightedSavings = getCallsiteCost(CS, DL);
+ BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+ auto RelativeFreq =
+ getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
+ auto NormWeightedRcost =
+ BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
+
+ // Weighted saving is smaller than weighted cost, return false
+ if (NormWeightedSavings < NormWeightedRcost) {
+ ORE.emit(
+ OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+ << NV("Callee", F) << " not partially inlined into "
+ << NV("Caller", Caller) << " runtime overhead (overhead="
+ << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+ << ", savings="
+ << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+ << " of making the outlined call is too high");
+
+ return false;
+ }
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
- << NV("Callee", Callee) << " can be partially inlined into "
+ << NV("Callee", F) << " can be partially inlined into "
<< NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
<< " (threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return true;
}
+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to estimate the size cost as well as runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+ int InlineCost = 0;
+ const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(CI), DL);
+ continue;
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(II), DL);
+ continue;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+ continue;
+ }
+ InlineCost += InlineConstants::InstrCost;
+ }
+ return InlineCost;
+}
+
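computeBBInlineCost above reduces to: skip debug intrinsics, charge call sites via getCallsiteCost, charge a switch once per case plus one, and apply a flat per-instruction cost otherwise. A sketch where InstrCost = 5 mirrors InlineConstants::InstrCost at this revision and the Inst struct is a stand-in, not an LLVM type:

    #include <cstdio>

    enum Kind { Simple, DbgIntrinsic, Call, Switch };
    struct Inst { Kind K; int NumCases = 0; int CallsiteCost = 0; };

    static int bbInlineCost(const Inst *Begin, const Inst *End) {
      const int InstrCost = 5;
      int Cost = 0;
      for (const Inst *I = Begin; I != End; ++I) {
        switch (I->K) {
        case DbgIntrinsic: break;                            // free
        case Call:   Cost += I->CallsiteCost; break;         // getCallsiteCost
        case Switch: Cost += (I->NumCases + 1) * InstrCost; break;
        case Simple: Cost += InstrCost; break;
        }
      }
      return Cost;
    }

    int main() {
      Inst BB[] = {{Simple}, {DbgIntrinsic}, {Switch, 3}, {Call, 0, 25}};
      printf("%d\n", bbInlineCost(BB, BB + 4)); // 5 + 0 + 20 + 25 = 50
      return 0;
    }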
+std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
+ Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
+ BasicBlock *OutliningCallBB) {
+ // First compute the cost of the outlined region 'OI' in the original
+ // function 'F':
+ int OutlinedRegionCost = 0;
+ for (BasicBlock &BB : *F) {
+ if (&BB != OI->ReturnBlock &&
+ // Assuming Entry set is small -- do a linear search here:
+ std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
+ OI->Entries.end()) {
+ OutlinedRegionCost += computeBBInlineCost(&BB);
+ }
+ }
+
+ // Now compute the cost of the call sequence to the outlined function
+ // 'OutlinedFunction' in BB 'OutliningCallBB':
+ int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+
+ // Now compute the cost of the extracted/outlined function itself:
+ int OutlinedFunctionCost = 0;
+ for (BasicBlock &BB : *OutlinedFunction) {
+ OutlinedFunctionCost += computeBBInlineCost(&BB);
+ }
+
+ assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+ "Outlined function cost should be no less than the outlined region");
+ int OutliningRuntimeOverhead =
+ OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+
+ return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
+ OutlinedRegionCost);
+}
+
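The second value computeOutliningCosts returns is this arithmetic, isolated: the new call sequence plus whatever the outlined function costs beyond the region that was moved into it. Worked with sample numbers:

    #include <cassert>

    int main() {
      int OutliningFuncCallCost = 15; // call sequence in OutliningCallBB
      int OutlinedFunctionCost = 120; // whole outlined function
      int OutlinedRegionCost = 100;   // the region as it sat in F
      int OutliningRuntimeOverhead =
          OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
      assert(OutliningRuntimeOverhead == 35);
      return 0;
    }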
+// Create the callsite to profile count map which is
+// used to update the original function's entry count,
+// after the function is partially inlined into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+ Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+ std::vector<User *> Users(DuplicateFunction->user_begin(),
+ DuplicateFunction->user_end());
+ Function *CurrentCaller = nullptr;
+ BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+  auto ComputeCurrBFI = [&, this](Function *Caller) {
+ // For the old pass manager:
+ if (!GetBFI) {
+ if (CurrentCallerBFI)
+ delete CurrentCallerBFI;
+ DominatorTree DT(*Caller);
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*Caller, LI);
+ CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI);
+ } else {
+ // New pass manager:
+ CurrentCallerBFI = &(*GetBFI)(*Caller);
+ }
+ };
+
+ for (User *User : Users) {
+ CallSite CS = getCallSite(User);
+ Function *Caller = CS.getCaller();
+ if (CurrentCaller != Caller) {
+ CurrentCaller = Caller;
+ ComputeCurrBFI(Caller);
+ } else {
+ assert(CurrentCallerBFI && "CallerBFI is not set");
+ }
+ BasicBlock *CallBB = CS.getInstruction()->getParent();
+ auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+ if (Count)
+ CallSiteToProfCountMap[User] = *Count;
+ else
+ CallSiteToProfCountMap[User] = 0;
+ }
+ if (!GetBFI) {
+ if (CurrentCallerBFI)
+ delete CurrentCallerBFI;
+ }
+}
+
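computeCallsiteToProfCountMap must own the BlockFrequencyInfo it builds under the legacy pass manager but only borrows one from GetBFI under the new pass manager, hence the manual deletes above. The same ownership split, sketched with unique_ptr and stand-in types:

    #include <cstdio>
    #include <memory>

    struct BFI { int Id; };

    static BFI *getBFI(int CallerId, bool HaveAnalysisManager,
                       std::unique_ptr<BFI> &Owned) {
      if (!HaveAnalysisManager) {
        Owned = std::make_unique<BFI>(BFI{CallerId}); // legacy PM: own it
        return Owned.get();
      }
      static BFI Cached{-1}; // stand-in for the analysis manager's copy
      return &Cached;        // new PM: borrowed, not owned
    }

    int main() {
      std::unique_ptr<BFI> Owned;
      BFI *B = getBFI(7, /*HaveAnalysisManager=*/false, Owned);
      printf("%d\n", B->Id); // 7; Owned releases it on scope exit
      return 0;
    }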
Function *PartialInlinerImpl::unswitchFunction(Function *F) {
if (F->hasAddressTaken())
@@ -347,21 +612,21 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
if (PSI->isFunctionEntryCold(F))
return nullptr;
- std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
- computeOutliningInfo(F);
+ if (F->user_begin() == F->user_end())
+ return nullptr;
+
+ std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
- if (!OutliningInfo)
+ if (!OI)
return nullptr;
// Clone the function, so that we can hack away on it.
ValueToValueMapTy VMap;
Function *DuplicateFunction = CloneFunction(F, VMap);
- BasicBlock *NewReturnBlock =
- cast<BasicBlock>(VMap[OutliningInfo->ReturnBlock]);
- BasicBlock *NewNonReturnBlock =
- cast<BasicBlock>(VMap[OutliningInfo->NonReturnBlock]);
+ BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+ BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
DenseSet<BasicBlock *> NewEntries;
- for (BasicBlock *BB : OutliningInfo->Entries) {
+ for (BasicBlock *BB : OI->Entries) {
NewEntries.insert(cast<BasicBlock>(VMap[BB]));
}
@@ -390,7 +655,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
BasicBlock *PreReturn = NewReturnBlock;
// only split block when necessary:
PHINode *FirstPhi = getFirstPHI(PreReturn);
- unsigned NumPredsFromEntries = OutliningInfo->ReturnBlockPreds.size();
+ unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
NewReturnBlock = NewReturnBlock->splitBasicBlock(
@@ -408,14 +673,14 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
Ins = NewReturnBlock->getFirstNonPHI();
RetPhi->addIncoming(&*I, PreReturn);
- for (BasicBlock *E : OutliningInfo->ReturnBlockPreds) {
+ for (BasicBlock *E : OI->ReturnBlockPreds) {
BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
OldPhi->removeIncomingValue(NewE);
}
++I;
}
- for (auto E : OutliningInfo->ReturnBlockPreds) {
+ for (auto E : OI->ReturnBlockPreds) {
BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
}
@@ -423,7 +688,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
// Returns true if the block is to be partially inlined into the caller
// (i.e. not to be extracted to the out-of-line function)
- auto ToBeInlined = [=](BasicBlock *BB) {
+ auto ToBeInlined = [&](BasicBlock *BB) {
return BB == NewReturnBlock || NewEntries.count(BB);
};
// Gather up the blocks that we're going to extract.
@@ -443,50 +708,113 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
// Extract the body of the if.
- Function *ExtractedFunction =
+ Function *OutlinedFunction =
CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
.extractCodeRegion();
- // Inline the top-level if test into all callers.
+ bool AnyInline =
+ tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+
+ // Ditch the duplicate, since we're done with it, and rewrite all remaining
+ // users (function pointers, etc.) back to the original function.
+ DuplicateFunction->replaceAllUsesWith(F);
+ DuplicateFunction->eraseFromParent();
+
+ if (AnyInline)
+ return OutlinedFunction;
+
+ // Remove the function that was speculatively created:
+ if (OutlinedFunction)
+ OutlinedFunction->eraseFromParent();
+
+ return nullptr;
+}
+
+bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
+ Function *F,
+ FunctionOutliningInfo *OI,
+ Function *OutlinedFunction,
+ BlockFrequencyInfo *CalleeBFI) {
+ if (OutlinedFunction == nullptr)
+ return false;
+
+ int NonWeightedRcost;
+ int SizeCost;
+ int OutlinedRegionSizeCost;
+
+ auto OutliningCallBB =
+ getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+
+ std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
+ computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+
+ // If the call sequence to the outlined function is larger than the original
+ // outlined region size, it does not increase the chances of inlining
+ // 'F' with outlining (the inliner uses the size increase to model
+ // the cost of inlining a callee).
+ if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
+ OptimizationRemarkEmitter ORE(F);
+ DebugLoc DLoc;
+ BasicBlock *Block;
+ std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+ DLoc, Block)
+ << ore::NV("Function", F)
+ << " not partially inlined into callers (Original Size = "
+ << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+ << ", Size of call sequence to outlined function = "
+ << ore::NV("NewSize", SizeCost) << ")");
+ return false;
+ }
+
+ assert(F->user_begin() == F->user_end() &&
+ "F's users should all be replaced!");
std::vector<User *> Users(DuplicateFunction->user_begin(),
DuplicateFunction->user_end());
+ DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+ if (F->getEntryCount())
+ computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+
+ auto CalleeEntryCount = F->getEntryCount();
+ uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+ bool AnyInline = false;
for (User *User : Users) {
- CallSite CS;
- if (CallInst *CI = dyn_cast<CallInst>(User))
- CS = CallSite(CI);
- else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
- CS = CallSite(II);
- else
- llvm_unreachable("All uses must be calls");
+ CallSite CS = getCallSite(User);
if (IsLimitReached())
continue;
OptimizationRemarkEmitter ORE(CS.getCaller());
- if (!shouldPartialInline(CS, ORE))
+
+ if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
+ NonWeightedRcost, ORE))
continue;
- DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
- BasicBlock *Block = CS.getParent();
- ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block)
- << ore::NV("Callee", F) << " partially inlined into "
- << ore::NV("Caller", CS.getCaller()));
+ ORE.emit(
+ OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
+ << ore::NV("Callee", F) << " partially inlined into "
+ << ore::NV("Caller", CS.getCaller()));
- InlineFunctionInfo IFI(nullptr, GetAssumptionCache);
+ InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
InlineFunction(CS, IFI);
+
+ // Now update the entry count:
+ if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+ uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+ CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+ }
+
+ AnyInline = true;
NumPartialInlining++;
- // update stats
+ // Update the stats
NumPartialInlined++;
}
- // Ditch the duplicate, since we're done with it, and rewrite all remaining
- // users (function pointers, etc.) back to the original function.
- DuplicateFunction->replaceAllUsesWith(F);
- DuplicateFunction->eraseFromParent();
-
+ if (AnyInline && CalleeEntryCount)
+ F->setEntryCount(CalleeEntryCountV);
- return ExtractedFunction;
+ return AnyInline;
}
bool PartialInlinerImpl::run(Module &M) {
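The entry-count bookkeeping in tryPartialInline, isolated: each call site that receives the inlined region no longer reaches the callee, so its profiled count is subtracted with saturation at zero before the final setEntryCount:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t CalleeEntryCount = 100;
      uint64_t SiteCounts[] = {60, 70}; // two call sites partially inlined
      for (uint64_t C : SiteCounts)
        CalleeEntryCount -= std::min(CalleeEntryCount, C); // saturate at 0
      assert(CalleeEntryCount == 0); // never wraps
      return 0;
    }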
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index d3a3c24ce7b4..659cb9df00a2 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Constants.h"
@@ -178,7 +179,7 @@ void filterModule(
else
GO = new GlobalVariable(
*M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
- (Constant *)nullptr, "", (GlobalVariable *)nullptr,
+ nullptr, "", nullptr,
GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
GO->takeName(GA);
GA->replaceAllUsesWith(GO);
@@ -320,7 +321,8 @@ void splitAndWriteThinLTOBitcode(
// FIXME: Try to re-use BSI and PFI from the original module here.
- ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr);
+ ProfileSummaryInfo PSI(M);
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
SmallVector<char, 0> Buffer;
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 153a186d5ed4..0ca62b7ae40c 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -847,92 +847,6 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
}
-/// \brief Return true if we can prove that adding the two values of the
-/// knownbits will not overflow.
-/// Otherwise return false.
-static bool checkRippleForAdd(const KnownBits &LHSKnown,
- const KnownBits &RHSKnown) {
- // Addition of two 2's complement numbers having opposite signs will never
- // overflow.
- if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
- (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
- return true;
-
- // If either of the values is known to be non-negative, adding them can only
- // overflow if the second is also non-negative, so we can assume that.
- // Two non-negative numbers will only overflow if there is a carry to the
- // sign bit, so we can check if even when the values are as big as possible
- // there is no overflow to the sign bit.
- if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
- APInt MaxLHS = ~LHSKnown.Zero;
- MaxLHS.clearSignBit();
- APInt MaxRHS = ~RHSKnown.Zero;
- MaxRHS.clearSignBit();
- APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
- return Result.isSignBitClear();
- }
-
- // If either of the values is known to be negative, adding them can only
- // overflow if the second is also negative, so we can assume that.
- // Two negative number will only overflow if there is no carry to the sign
- // bit, so we can check if even when the values are as small as possible
- // there is overflow to the sign bit.
- if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
- APInt MinLHS = LHSKnown.One;
- MinLHS.clearSignBit();
- APInt MinRHS = RHSKnown.One;
- MinRHS.clearSignBit();
- APInt Result = std::move(MinLHS) + std::move(MinRHS);
- return Result.isSignBitSet();
- }
-
- // If we reached here it means that we know nothing about the sign bits.
- // In this case we can't know if there will be an overflow, since by
- // changing the sign bits any two values can be made to overflow.
- return false;
-}
-
-/// Return true if we can prove that:
-/// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS))
-/// This basically requires proving that the add in the original type would not
-/// overflow to change the sign bit or have a carry out.
-bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
- Instruction &CxtI) {
- // There are different heuristics we can use for this. Here are some simple
- // ones.
-
- // If LHS and RHS each have at least two sign bits, the addition will look
- // like
- //
- // XX..... +
- // YY.....
- //
- // If the carry into the most significant position is 0, X and Y can't both
- // be 1 and therefore the carry out of the addition is also 0.
- //
- // If the carry into the most significant position is 1, X and Y can't both
- // be 0 and therefore the carry out of the addition is also 1.
- //
- // Since the carry into the most significant position is always equal to
- // the carry out of the addition, there is no signed overflow.
- if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
- ComputeNumSignBits(RHS, 0, &CxtI) > 1)
- return true;
-
- unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
- KnownBits LHSKnown(BitWidth);
- computeKnownBits(LHS, LHSKnown, 0, &CxtI);
-
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(RHS, RHSKnown, 0, &CxtI);
-
- // Check if carry bit of addition will not cause overflow.
- if (checkRippleForAdd(LHSKnown, RHSKnown))
- return true;
-
- return false;
-}
-
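The sign-bit argument in the removed comments still underlies the replacement helper: operands that each carry at least two sign bits cannot overflow a signed add. A standalone check of that claim at i8 width (hypothetical driver, not part of the patch):

    #include <cassert>

    int main() {
      // The i8 values with at least two sign bits are exactly [-64, 63].
      for (int lhs = -64; lhs <= 63; ++lhs)
        for (int rhs = -64; rhs <= 63; ++rhs) {
          int sum = lhs + rhs;               // exact result at int width
          assert(sum >= -128 && sum <= 127); // always representable in i8
        }
      return 0;
    }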
/// \brief Return true if we can prove that:
/// (sub LHS, RHS) === (sub nsw LHS, RHS)
/// This basically requires proving that the add in the original type would not
@@ -968,13 +882,9 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,
Instruction &CxtI) {
// If the LHS is negative and the RHS is non-negative, no unsigned wrap.
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0,
- &CxtI);
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0,
- &CxtI);
- if (LHSKnownNegative && RHSKnownNonNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
return true;
return false;
@@ -1041,6 +951,57 @@ static Value *checkForNegativeOperand(BinaryOperator &I,
return nullptr;
}
+static Instruction *foldAddWithConstant(BinaryOperator &Add,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
+ const APInt *C;
+ if (!match(Op1, m_APInt(C)))
+ return nullptr;
+
+ if (C->isSignMask()) {
+ // If wrapping is not allowed, then the addition must set the sign bit:
+ // X + (signmask) --> X | signmask
+ if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
+ return BinaryOperator::CreateOr(Op0, Op1);
+
+ // If wrapping is allowed, then the addition flips the sign bit of LHS:
+ // X + (signmask) --> X ^ signmask
+ return BinaryOperator::CreateXor(Op0, Op1);
+ }
+
+ Value *X;
+ const APInt *C2;
+ Type *Ty = Add.getType();
+
+ // Is this add the last step in a convoluted sext?
+ // add(zext(xor i16 X, -32768), -32768) --> sext X
+ if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
+ C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
+ return CastInst::Create(Instruction::SExt, X, Ty);
+
+ // (add (zext (add nuw X, C2)), C) --> (zext (add nuw X, C2 + C))
+  // FIXME: Should this check hasOneUse to avoid increasing the instruction count?
+ if (C->isNegative() &&
+ match(Op0, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2)))) &&
+ C->sge(-C2->sext(C->getBitWidth()))) {
+ Constant *NewC =
+ ConstantInt::get(X->getType(), *C2 + C->trunc(C2->getBitWidth()));
+ return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
+ }
+
+ // Shifts and add used to flip and mask off the low bit:
+ // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
+ const APInt *C3;
+ if (*C == 1 && match(Op0, m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C2)),
+ m_APInt(C3)))) &&
+ C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
+ Value *NotX = Builder.CreateNot(X);
+ return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
+ }
+
+ return nullptr;
+}
+
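The two signmask folds at the top of foldAddWithConstant are easy to sanity-check numerically; a minimal sketch in plain C++ (not LLVM code):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t SignMask = 0x80000000u;
      const uint32_t vals[] = {0u, 1u, 0x7fffffffu, 0x80000000u, 0xdeadbeefu};
      for (uint32_t x : vals) {
        assert(x + SignMask == (x ^ SignMask)); // wrapping add flips the sign bit
        if (x < SignMask)                       // no-wrap case: sign bit was clear
          assert(x + SignMask == (x | SignMask));
      }
      return 0;
    }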
Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
@@ -1056,41 +1017,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
- const APInt *RHSC;
- if (match(RHS, m_APInt(RHSC))) {
- if (RHSC->isSignMask()) {
- // If wrapping is not allowed, then the addition must set the sign bit:
- // X + (signmask) --> X | signmask
- if (I.hasNoSignedWrap() || I.hasNoUnsignedWrap())
- return BinaryOperator::CreateOr(LHS, RHS);
-
- // If wrapping is allowed, then the addition flips the sign bit of LHS:
- // X + (signmask) --> X ^ signmask
- return BinaryOperator::CreateXor(LHS, RHS);
- }
-
- // Is this add the last step in a convoluted sext?
- Value *X;
- const APInt *C;
- if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) &&
- C->isMinSignedValue() &&
- C->sext(LHS->getType()->getScalarSizeInBits()) == *RHSC) {
- // add(zext(xor i16 X, -32768), -32768) --> sext X
- return CastInst::Create(Instruction::SExt, X, LHS->getType());
- }
-
- if (RHSC->isNegative() &&
- match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) &&
- RHSC->sge(-C->sext(RHSC->getBitWidth()))) {
- // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val))
- Constant *NewC =
- ConstantInt::get(X->getType(), *C + RHSC->trunc(C->getBitWidth()));
- return new ZExtInst(Builder->CreateNUWAdd(X, NewC), I.getType());
- }
- }
+ if (Instruction *X = foldAddWithConstant(I, *Builder))
+ return X;
- // FIXME: Use the match above instead of dyn_cast to allow these transforms
- // for splat vectors.
+ // FIXME: This should be moved into the above helper function to allow these
+ // transforms for splat vectors.
if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
// zext(bool) + C -> bool ? C + 1 : C
if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
@@ -1285,8 +1216,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
if (ConstantExpr::getZExt(CI, I.getType()) == RHSC &&
- computeOverflowForUnsignedAdd(LHSConv->getOperand(0), CI, &I) ==
- OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0), CI, I)) {
// Insert the new, smaller add.
Value *NewAdd =
Builder->CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv");
@@ -1303,9 +1233,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (LHSConv->getOperand(0)->getType() ==
RHSConv->getOperand(0)->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- computeOverflowForUnsignedAdd(LHSConv->getOperand(0),
- RHSConv->getOperand(0),
- &I) == OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0), I)) {
// Insert the new integer add.
Value *NewAdd = Builder->CreateNUWAdd(
LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv");
@@ -1347,15 +1276,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
// TODO(jingyue): Consider WillNotOverflowSignedAdd and
- // WillNotOverflowUnsignedAdd to reduce the number of invocations of
+ // willNotOverflowUnsignedAdd to reduce the number of invocations of
// computeKnownBits.
if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() &&
- computeOverflowForUnsignedAdd(LHS, RHS, &I) ==
- OverflowResult::NeverOverflows) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b114801cc1c0..82dc88f1b3ad 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -23,21 +23,6 @@ using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
-static inline Value *dyn_castNotVal(Value *V) {
- // If this is not(not(x)) don't return that this is a not: we want the two
- // not's to be folded first.
- if (BinaryOperator::isNot(V)) {
- Value *Operand = BinaryOperator::getNotArgument(V);
- if (!IsFreeToInvert(Operand, Operand->hasOneUse()))
- return Operand;
- }
-
- // Constants can be considered to be not'ed values...
- if (ConstantInt *C = dyn_cast<ConstantInt>(V))
- return ConstantInt::get(C->getType(), ~C->getValue());
- return nullptr;
-}
-
/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
/// a four bit mask.
static unsigned getFCmpCode(FCmpInst::Predicate CC) {
@@ -713,9 +698,8 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
}
// This simplification is only valid if the upper range is not negative.
- bool IsNegative, IsNotNegative;
- ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1);
- if (!IsNotNegative)
+ KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1);
+ if (!Known.isNonNegative())
return nullptr;
if (Inverted)
@@ -1013,26 +997,22 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
/// (~A & ~B) == (~(A | B))
/// (~A | ~B) == (~(A & B))
static Instruction *matchDeMorgansLaws(BinaryOperator &I,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy &Builder) {
auto Opcode = I.getOpcode();
assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
"Trying to match De Morgan's Laws with something other than and/or");
+
// Flip the logic operation.
- if (Opcode == Instruction::And)
- Opcode = Instruction::Or;
- else
- Opcode = Instruction::And;
+ Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- // TODO: Use pattern matchers instead of dyn_cast.
- if (Value *Op0NotVal = dyn_castNotVal(Op0))
- if (Value *Op1NotVal = dyn_castNotVal(Op1))
- if (Op0->hasOneUse() && Op1->hasOneUse()) {
- Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal,
- I.getName() + ".demorgan");
- return BinaryOperator::CreateNot(LogicOp);
- }
+ Value *A, *B;
+ if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
+ match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
+ !IsFreeToInvert(A, A->hasOneUse()) &&
+ !IsFreeToInvert(B, B->hasOneUse())) {
+ Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
+ return BinaryOperator::CreateNot(AndOr);
+ }
return nullptr;
}
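De Morgan's laws themselves are cheap to spot-check; a standalone illustration of the two identities the matcher now recognizes (hypothetical driver):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t vals[] = {0u, 1u, 7u, 0x12345678u, 0xdeadbeefu, ~0u};
      for (uint32_t a : vals)
        for (uint32_t b : vals) {
          assert((~a & ~b) == ~(a | b)); // the 'and' form matched above
          assert((~a | ~b) == ~(a & b)); // the 'or' form
        }
      return 0;
    }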
@@ -1376,7 +1356,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))
return DeMorgan;
{
@@ -2005,18 +1985,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Value *V = SimplifyBSwap(I))
return replaceInstUsesWith(I, V);
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
- ConstantInt *C1 = nullptr; Value *X = nullptr;
- // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
- if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) &&
- Op0->hasOneUse()) {
- Value *Or = Builder->CreateOr(X, RHS);
- Or->takeName(Op0);
- return BinaryOperator::CreateXor(Or,
- Builder->getInt(C1->getValue() & ~RHS->getValue()));
- }
- }
-
if (isa<Constant>(Op1))
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
@@ -2167,7 +2135,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C));
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder))
return DeMorgan;
// Canonicalize xor to the RHS.
@@ -2399,27 +2367,44 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
// Is this a 'not' (~) fed by a binary operator?
- BinaryOperator *NotOp;
- if (match(&I, m_Not(m_BinOp(NotOp)))) {
- if (NotOp->getOpcode() == Instruction::And ||
- NotOp->getOpcode() == Instruction::Or) {
+ BinaryOperator *NotVal;
+ if (match(&I, m_Not(m_BinOp(NotVal)))) {
+ if (NotVal->getOpcode() == Instruction::And ||
+ NotVal->getOpcode() == Instruction::Or) {
// Apply DeMorgan's Law when inverts are free:
// ~(X & Y) --> (~X | ~Y)
// ~(X | Y) --> (~X & ~Y)
- if (IsFreeToInvert(NotOp->getOperand(0),
- NotOp->getOperand(0)->hasOneUse()) &&
- IsFreeToInvert(NotOp->getOperand(1),
- NotOp->getOperand(1)->hasOneUse())) {
- Value *NotX = Builder->CreateNot(NotOp->getOperand(0), "notlhs");
- Value *NotY = Builder->CreateNot(NotOp->getOperand(1), "notrhs");
- if (NotOp->getOpcode() == Instruction::And)
+ if (IsFreeToInvert(NotVal->getOperand(0),
+ NotVal->getOperand(0)->hasOneUse()) &&
+ IsFreeToInvert(NotVal->getOperand(1),
+ NotVal->getOperand(1)->hasOneUse())) {
+ Value *NotX = Builder->CreateNot(NotVal->getOperand(0), "notlhs");
+ Value *NotY = Builder->CreateNot(NotVal->getOperand(1), "notrhs");
+ if (NotVal->getOpcode() == Instruction::And)
return BinaryOperator::CreateOr(NotX, NotY);
return BinaryOperator::CreateAnd(NotX, NotY);
}
- } else if (NotOp->getOpcode() == Instruction::AShr) {
- // ~(~X >>s Y) --> (X >>s Y)
- if (Value *Op0NotVal = dyn_castNotVal(NotOp->getOperand(0)))
- return BinaryOperator::CreateAShr(Op0NotVal, NotOp->getOperand(1));
+ }
+
+ // ~(~X >>s Y) --> (X >>s Y)
+ if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
+ return BinaryOperator::CreateAShr(X, Y);
+
+ // If we are inverting a right-shifted constant, we may be able to eliminate
+ // the 'not' by inverting the constant and using the opposite shift type.
+ // Canonicalization rules ensure that only a negative constant uses 'ashr',
+ // but we must check that in case that transform has not fired yet.
+ const APInt *C;
+ if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) {
+ // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateLShr(NotC, Y);
+ }
+
+ if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) {
+ // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateAShr(NotC, Y);
}
}
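The shift-flavor swap above rests on an identity over the replicated sign bits: for negative C, ~(C >>s Y) equals (~C) >>u Y, and dually for non-negative C. A standalone check, assuming >> on a negative int is an arithmetic shift as on mainstream two's-complement targets:

    #include <cassert>
    #include <cstdint>

    int main() {
      int32_t C = (int32_t)0xf0000000u; // negative: ashr shifts in ones
      for (int y = 0; y < 32; ++y) {
        uint32_t ashr = (uint32_t)(C >> y);  // C >>s y
        uint32_t lshr = ((uint32_t)~C) >> y; // ~C >>u y
        assert(~ashr == lshr);
      }
      return 0;
    }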
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 6989d67f0060..face7abcc95f 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1384,10 +1384,10 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
// Create a mask for bits above (ctlz) or below (cttz) the first known one.
bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
- unsigned PossibleZeros = IsTZ ? Known.One.countTrailingZeros()
- : Known.One.countLeadingZeros();
- unsigned DefiniteZeros = IsTZ ? Known.Zero.countTrailingOnes()
- : Known.Zero.countLeadingOnes();
+ unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
+ : Known.countMaxLeadingZeros();
+ unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
+ : Known.countMinLeadingZeros();
// If all bits above (ctlz) or below (cttz) the first known one are known
// zero, this value is constant.
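The renamed helpers read as the min/max possible zero counts directly. For a value whose low three bits are proven zero and whose bit 3 is proven one, the two bounds coincide and cttz folds to a constant; a small sketch of the same arithmetic (using GCC/Clang's __builtin_ctz purely for illustration):

    #include <cassert>

    int main() {
      unsigned knownZero = 0x07; // bits proven zero
      unsigned knownOne  = 0x08; // bits proven one
      int maxTZ = __builtin_ctz(knownOne);   // Known.One.countTrailingZeros() == 3
      int minTZ = __builtin_ctz(~knownZero); // Known.Zero.countTrailingOnes() == 3
      assert(minTZ == 3 && maxTZ == 3);      // bounds meet: cttz is the constant 3
      return 0;
    }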
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 312d9baae43a..001a4bcf16f3 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -559,6 +559,9 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
}
+ // FIXME: Maybe combine the next two transforms to handle the no cast case
+ // more efficiently. Support vector types. Cleanup code by using m_OneUse.
+
// Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
Value *A = nullptr; ConstantInt *Cst = nullptr;
if (Src->hasOneUse() &&
@@ -588,15 +591,20 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// the sign bit of the original value; performing ashr instead of lshr
// generates bits of the same value as the sign bit.
if (Src->hasOneUse() &&
- match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) &&
- cast<Instruction>(Src)->getOperand(0)->hasOneUse()) {
+ match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) {
+ Value *SExt = cast<Instruction>(Src)->getOperand(0);
+ const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits();
const unsigned ASize = A->getType()->getPrimitiveSizeInBits();
+ unsigned ShiftAmt = Cst->getZExtValue();
// This optimization can be only performed when zero bits generated by
// the original lshr aren't pulled into the value after truncation, so we
- // can only shift by values smaller than the size of destination type (in
- // bits).
- if (Cst->getValue().ult(ASize)) {
- Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue());
+ // can only shift by values no larger than the number of extension bits.
+ // FIXME: Instead of bailing when the shift is too large, use and to clear
+ // the extra bits.
+ if (SExt->hasOneUse() && ShiftAmt <= SExtSize - ASize) {
+ // If shifting by the size of the original value in bits or more, it is
+      // being filled with the sign bit, so shift by ASize-1 to avoid UB.
+ Value *Shift = Builder->CreateAShr(A, std::min(ShiftAmt, ASize-1));
Shift->takeName(Src);
return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
}
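The widened bound (ShiftAmt <= SExtSize - ASize) together with the ASize-1 clamp can be checked exhaustively at i8/i32; a standalone sketch, again assuming arithmetic >> on negative int:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int a = -128; a <= 127; ++a)
        for (unsigned sh = 0; sh <= 24; ++sh) {      // sh <= SExtSize - ASize
          uint32_t sext = (uint32_t)(int32_t)a;      // sext i8 -> i32
          uint8_t viaLshr = (uint8_t)(sext >> sh);   // trunc(lshr(sext a, sh))
          unsigned clamped = sh < 7 ? sh : 7;        // min(ShiftAmt, ASize - 1)
          uint8_t viaAshr = (uint8_t)(a >> clamped); // ashr on the narrow value
          assert(viaLshr == viaAshr);
        }
      return 0;
    }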
@@ -1180,9 +1188,8 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
// If we know that the value being extended is positive, we can use a zext
// instead.
- bool KnownZero, KnownOne;
- ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI);
- if (KnownZero) {
+ KnownBits Known = computeKnownBits(Src, 0, &CI);
+ if (Known.isNonNegative()) {
Value *ZExt = Builder->CreateZExt(Src, DestTy);
return replaceInstUsesWith(CI, ZExt);
}
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 34ce235b3fe2..60ed4057cedd 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -2785,6 +2785,9 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
}
/// Try to fold icmp (binop), X or icmp X, (binop).
+/// TODO: A large part of this logic is duplicated in InstSimplify's
+/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
+/// duplication.
Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -2794,7 +2797,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
if (!BO0 && !BO1)
return nullptr;
- CmpInst::Predicate Pred = I.getPredicate();
+ const CmpInst::Predicate Pred = I.getPredicate();
bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
if (BO0 && isa<OverflowingBinaryOperator>(BO0))
NoOp0WrapProblem =
@@ -3029,21 +3032,20 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
case Instruction::Sub:
case Instruction::Xor:
if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
// icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
if (CI->getValue().isSignMask()) {
- ICmpInst::Predicate Pred =
+ ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) {
- ICmpInst::Predicate Pred =
+ ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
- Pred = I.getSwappedPredicate(Pred);
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ NewPred = I.getSwappedPredicate(NewPred);
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
}
break;
@@ -3062,21 +3064,27 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
AP.getBitWidth() - AP.countTrailingZeros()));
Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask);
Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask);
- return new ICmpInst(I.getPredicate(), And1, And2);
+ return new ICmpInst(Pred, And1, And2);
}
}
break;
+
case Instruction::UDiv:
case Instruction::LShr:
- if (I.isSigned())
+ if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
break;
- LLVM_FALLTHROUGH;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::SDiv:
+ if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::AShr:
if (!BO0->isExact() || !BO1->isExact())
break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::Shl: {
bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
@@ -3084,8 +3092,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
break;
if (!NSW && I.isSigned())
break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
}
}
}
@@ -3096,7 +3103,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
auto BitwiseAnd =
m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value()));
- if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) {
+ if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
auto *Zero = Constant::getNullValue(BO0->getType());
return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
}
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 3be6419a129a..1424f61fe701 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -30,6 +30,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -388,10 +389,21 @@ private:
bool DoTransform = true);
Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
- bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI);
+ bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+ return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+  }
+ bool willNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) {
+ return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+  }
bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI);
+ bool willNotOverflowUnsignedMul(Value *LHS, Value *RHS, Instruction &CxtI) {
+ return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+  }
Value *EmitGEPOffset(User *GEP);
Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
@@ -492,7 +504,11 @@ public:
void computeKnownBits(Value *V, KnownBits &Known,
unsigned Depth, Instruction *CxtI) const {
- return llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ }
+ KnownBits computeKnownBits(Value *V, unsigned Depth,
+ Instruction *CxtI) const {
+ return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
}
bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0,
@@ -503,11 +519,6 @@ public:
Instruction *CxtI = nullptr) const {
return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
}
- void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth = 0, Instruction *CxtI = nullptr) const {
- return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI,
- &DT);
- }
OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS,
const Instruction *CxtI) {
return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
@@ -516,6 +527,11 @@ public:
const Instruction *CxtI) {
return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
}
+ OverflowResult computeOverflowForSignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
/// Maximum size of array considered when transforming.
uint64_t MaxArraySizeForCombine;
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 675553017838..a4d84ae81aa0 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -885,10 +885,8 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
// first non-zero index.
auto IsAllNonNegative = [&]() {
for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
- bool KnownNonNegative, KnownNegative;
- IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative,
- KnownNegative, 0, MemI);
- if (KnownNonNegative)
+ KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
+ if (Known.isNonNegative())
continue;
return false;
}
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index face9d9237ae..2a35259f2103 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -162,11 +162,9 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,
// product is exactly the minimum negative number.
// E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000
// For simplicity we just check if at least one side is not negative.
- bool LHSNonNegative, LHSNegative;
- bool RHSNonNegative, RHSNegative;
- ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, &CxtI);
- ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, &CxtI);
- if (LHSNonNegative || RHSNonNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative())
return true;
}
return false;
@@ -422,8 +420,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());
if (ConstantExpr::getZExt(CI, I.getType()) == Op1C &&
- computeOverflowForUnsignedMul(Op0Conv->getOperand(0), CI, &I) ==
- OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0), CI, I)) {
// Insert the new, smaller mul.
Value *NewMul =
Builder->CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv");
@@ -440,9 +437,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (Op0Conv->getOperand(0)->getType() ==
Op1Conv->getOperand(0)->getType() &&
(Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) &&
- computeOverflowForUnsignedMul(Op0Conv->getOperand(0),
- Op1Conv->getOperand(0),
- &I) == OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0),
+ Op1Conv->getOperand(0), I)) {
// Insert the new integer mul.
Value *NewMul = Builder->CreateNUWMul(
Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv");
@@ -456,9 +452,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() &&
- computeOverflowForUnsignedMul(Op0, Op1, &I) ==
- OverflowResult::NeverOverflows) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 05b01774cd5e..4028a92771a4 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -611,7 +611,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
return I;
- unsigned Leaders = Known2.Zero.countLeadingOnes();
+ unsigned Leaders = Known2.countMinLeadingZeros();
Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
break;
}
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1792cb585f87..65b1148cb03b 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2212,9 +2212,9 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
// Canonicalize fcmp_one -> fcmp_oeq
FCmpInst::Predicate FPred; Value *Y;
- if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)),
- TrueDest, FalseDest)) &&
- BI.getCondition()->hasOneUse())
+ if (match(&BI, m_Br(m_OneUse(m_FCmp(FPred, m_Value(X), m_Value(Y))),
+ TrueDest, FalseDest))) {
+ // TODO: Why are we only transforming these 3 predicates?
if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
FPred == FCmpInst::FCMP_OGE) {
FCmpInst *Cond = cast<FCmpInst>(BI.getCondition());
@@ -2225,12 +2225,12 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
Worklist.Add(Cond);
return &BI;
}
+ }
// Canonicalize icmp_ne -> icmp_eq
ICmpInst::Predicate IPred;
- if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)),
- TrueDest, FalseDest)) &&
- BI.getCondition()->hasOneUse())
+ if (match(&BI, m_Br(m_OneUse(m_ICmp(IPred, m_Value(X), m_Value(Y))),
+ TrueDest, FalseDest))) {
if (IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE ||
IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||
IPred == ICmpInst::ICMP_SGE) {
@@ -2241,6 +2241,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
Worklist.Add(Cond);
return &BI;
}
+ }
return nullptr;
}
@@ -2264,8 +2265,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth();
KnownBits Known(BitWidth);
computeKnownBits(Cond, Known, 0, &SI);
- unsigned LeadingKnownZeros = Known.Zero.countLeadingOnes();
- unsigned LeadingKnownOnes = Known.One.countLeadingOnes();
+ unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
+ unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
// Compute the number of leading bits we can ignore.
// TODO: A better way to determine this would use ComputeNumSignBits().
@@ -3141,7 +3142,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
// Lower dbg.declare intrinsics otherwise their value may be clobbered
// by instcombiner.
- bool DbgDeclaresChanged = LowerDbgDeclare(F);
+ bool MadeIRChange = LowerDbgDeclare(F);
// Iterate while there is work to do.
int Iteration = 0;
@@ -3150,18 +3151,17 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
- bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
+ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines,
AA, AC, TLI, DT, DL, LI);
IC.MaxArraySizeForCombine = MaxArraySize;
- Changed |= IC.run();
- if (!Changed)
+ if (!IC.run())
break;
}
- return DbgDeclaresChanged || Iteration > 1;
+ return MadeIRChange || Iteration > 1;
}
PreservedAnalyses InstCombinePass::run(Function &F,
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index b034ccc46933..7eea44d6aca0 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -613,7 +613,15 @@ public:
bool UseGlobalsGC = true)
: ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan),
Recover(Recover || ClRecover),
- UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC) {}
+ UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
+ // Not a typo: ClWithComdat is almost completely pointless without
+ // ClUseGlobalsGC (because then it only works on modules without
+ // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
+      // and both suffer from gold PR19002, for which the UseGlobalsGC
+      // constructor argument is designed as a workaround. Therefore, disable
+      // both ClWithComdat and ClUseGlobalsGC unless the frontend says it's
+      // ok to do globals-gc.
+ UseCtorComdat(UseGlobalsGC && ClWithComdat) {}
bool runOnModule(Module &M) override;
static char ID; // Pass identification, replacement for typeid
StringRef getPassName() const override { return "AddressSanitizerModule"; }
@@ -656,6 +664,7 @@ private:
bool CompileKernel;
bool Recover;
bool UseGlobalsGC;
+ bool UseCtorComdat;
Type *IntptrTy;
LLVMContext *C;
Triple TargetTriple;
@@ -1677,7 +1686,7 @@ AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer,
: GlobalVariable::PrivateLinkage;
GlobalVariable *Metadata = new GlobalVariable(
M, Initializer->getType(), false, Linkage, Initializer,
- Twine("__asan_global_") + GlobalValue::getRealLinkageName(OriginalName));
+ Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName));
Metadata->setSection(getGlobalMetadataSection());
return Metadata;
}
@@ -1782,7 +1791,7 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(
// On recent Mach-O platforms, use a structure which binds the liveness of
// the global variable to the metadata struct. Keep the list of "Liveness" GV
// created to be added to llvm.compiler.used
- StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr);
+ StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
@@ -1793,9 +1802,9 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(
// On recent Mach-O platforms, we emit the global metadata in a way that
// allows the linker to properly strip dead globals.
- auto LivenessBinder = ConstantStruct::get(
- LivenessTy, Initializer->getAggregateElement(0u),
- ConstantExpr::getPointerCast(Metadata, IntptrTy), nullptr);
+ auto LivenessBinder =
+ ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u),
+ ConstantExpr::getPointerCast(Metadata, IntptrTy));
GlobalVariable *Liveness = new GlobalVariable(
M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
Twine("__asan_binder_") + G->getName());
@@ -1893,7 +1902,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
// We initialize an array of such structures and pass it to a run-time call.
StructType *GlobalStructTy =
StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
- IntptrTy, IntptrTy, IntptrTy, nullptr);
+ IntptrTy, IntptrTy, IntptrTy);
SmallVector<GlobalVariable *, 16> NewGlobals(n);
SmallVector<Constant *, 16> Initializers(n);
@@ -1929,10 +1938,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
- StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr);
- Constant *NewInitializer =
- ConstantStruct::get(NewTy, G->getInitializer(),
- Constant::getNullValue(RightRedZoneTy), nullptr);
+ StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
+ Constant *NewInitializer = ConstantStruct::get(
+ NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));
// Create a new global variable with enough space for a redzone.
GlobalValue::LinkageTypes Linkage = G->getLinkage();
@@ -2013,7 +2021,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
ConstantExpr::getPointerCast(Name, IntptrTy),
ConstantExpr::getPointerCast(ModuleName, IntptrTy),
ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
- ConstantExpr::getPointerCast(ODRIndicator, IntptrTy), nullptr);
+ ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
@@ -2073,7 +2081,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
// Put the constructor and destructor in comdat if both
// (1) global instrumentation is not TU-specific
// (2) target is ELF.
- if (ClWithComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
+ if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority,
AsanCtorFunction);
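Several hunks in these sanitizer files drop the old sentinel-terminated varargs overloads of StructType::get and ConstantStruct::get in favor of the type-safe variadic forms. A hedged sketch of the new shape (compiles against LLVM headers; not standalone):

    #include "llvm/IR/DerivedTypes.h"

    // Old form: element list terminated by a nullptr sentinel (easy to get wrong):
    //   StructType::get(IntptrTy, IntptrTy, nullptr);
    // New form: a variadic template deduces the element list; no sentinel needed.
    llvm::StructType *makeLivenessTy(llvm::Type *IntptrTy) {
      return llvm::StructType::get(IntptrTy, IntptrTy);
    }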
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 8786781933ea..e2e3cbdbc295 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -388,7 +388,7 @@ FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
ArgTypes.push_back(ShadowPtrTy);
Type *RetType = T->getReturnType();
if (!RetType->isVoidTy())
- RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr);
+ RetType = StructType::get(RetType, ShadowTy);
return FunctionType::get(RetType, ArgTypes, T->isVarArg());
}
@@ -476,16 +476,14 @@ bool DataFlowSanitizer::doInitialization(Module &M) {
GetArgTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),
PointerType::getUnqual(
- FunctionType::get(PointerType::getUnqual(ArgTLSTy),
- (Type *)nullptr)));
+ FunctionType::get(PointerType::getUnqual(ArgTLSTy), false)));
}
if (GetRetvalTLSPtr) {
RetvalTLS = nullptr;
GetRetvalTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),
PointerType::getUnqual(
- FunctionType::get(PointerType::getUnqual(ShadowTy),
- (Type *)nullptr)));
+ FunctionType::get(PointerType::getUnqual(ShadowTy), false)));
}
ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 7dea1dee756a..e89384c559fe 100644
--- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -398,8 +398,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
// u64 *ArrayCounter;
// };
auto *StructInfoTy =
- StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
- Int8PtrPtrTy, Int64PtrTy, Int64PtrTy, nullptr);
+ StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
+ Int8PtrPtrTy, Int64PtrTy, Int64PtrTy);
auto *StructInfoPtrTy = StructInfoTy->getPointerTo();
// This structure should be kept consistent with the CacheFragInfo struct
// in the runtime library.
@@ -408,8 +408,7 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
// u32 NumStructs;
// StructInfo *Structs;
// };
- auto *CacheFragInfoTy =
- StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy, nullptr);
+ auto *CacheFragInfoTy = StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy);
std::vector<StructType *> Vec = M.getIdentifiedStructTypes();
unsigned NumStructs = 0;
@@ -457,24 +456,23 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0);
ArrayCounterIdx[1] = ConstantInt::get(Int32Ty,
getArrayCounterIdx(StructTy));
- Initializers.push_back(
- ConstantStruct::get(
- StructInfoTy,
- ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
- ConstantInt::get(Int32Ty,
- DL.getStructLayout(StructTy)->getSizeInBytes()),
- ConstantInt::get(Int32Ty, StructTy->getNumElements()),
- Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
- ConstantExpr::getPointerCast(Offset, Int32PtrTy),
- Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
- ConstantExpr::getPointerCast(Size, Int32PtrTy),
- TypeName == nullptr ? ConstantPointerNull::get(Int8PtrPtrTy) :
- ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
- ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
- FieldCounterIdx),
- ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
- ArrayCounterIdx),
- nullptr));
+ Initializers.push_back(ConstantStruct::get(
+ StructInfoTy,
+ ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
+ ConstantInt::get(Int32Ty,
+ DL.getStructLayout(StructTy)->getSizeInBytes()),
+ ConstantInt::get(Int32Ty, StructTy->getNumElements()),
+ Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Offset, Int32PtrTy),
+ Size == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Size, Int32PtrTy),
+ TypeName == nullptr
+ ? ConstantPointerNull::get(Int8PtrPtrTy)
+ : ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ FieldCounterIdx),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ ArrayCounterIdx)));
}
// Structs.
Constant *StructInfo;
@@ -491,11 +489,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
auto *CacheFragInfoGV = new GlobalVariable(
M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage,
- ConstantStruct::get(CacheFragInfoTy,
- UnitName,
- ConstantInt::get(Int32Ty, NumStructs),
- StructInfo,
- nullptr));
+ ConstantStruct::get(CacheFragInfoTy, UnitName,
+ ConstantInt::get(Int32Ty, NumStructs), StructInfo));
return CacheFragInfoGV;
}
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 15333a5317dd..ff753c20a94a 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -1576,13 +1576,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
bool Signed = false) {
Type *srcTy = V->getType();
+ size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
+ size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
+ if (srcSizeInBits > 1 && dstSizeInBits == 1)
+ return IRB.CreateICmpNE(V, getCleanShadow(V));
+
if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
return IRB.CreateIntCast(V, dstTy, Signed);
if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
return IRB.CreateIntCast(V, dstTy, Signed);
- size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
- size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
Value *V2 =
IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
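The new early case matters because a plain truncation to i1 would keep only shadow bit 0, silently dropping poison recorded in higher bits; comparing against the clean (all-zero) shadow preserves it. A standalone illustration:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t shadow = 0x100;    // poison only in bit 8
      bool viaTrunc = shadow & 1; // trunc to i1: the poison is lost
      bool viaCmp = shadow != 0;  // icmp ne against the clean shadow: kept
      assert(!viaTrunc && viaCmp);
      return 0;
    }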
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 3f1a77b49a44..ee493a8ec7e1 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -442,9 +442,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
bool Changed = false;
if (!NUW) {
- ConstantRange NUWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoUnsignedWrap);
+ ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoUnsignedWrap);
if (!NUWRange.isEmptySet()) {
bool NewNUW = NUWRange.contains(LazyRRange());
AddOp->setHasNoUnsignedWrap(NewNUW);
@@ -452,9 +451,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
}
}
if (!NSW) {
- ConstantRange NSWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoSignedWrap);
+ ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoSignedWrap);
if (!NSWRange.isEmptySet()) {
bool NewNSW = NSWRange.contains(LazyRRange());
AddOp->setHasNoSignedWrap(NewNSW);
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 48d5ae88cda9..6693a26e8890 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -144,6 +144,10 @@ private:
bool recognizePopcount();
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var);
+ bool recognizeAndInsertCTLZ();
+ void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var, const DebugLoc DL,
+ bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
/// @}
};
@@ -994,7 +998,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
}
bool LoopIdiomRecognize::runOnNoncountableLoop() {
- return recognizePopcount();
+ return recognizePopcount() || recognizeAndInsertCTLZ();
}
/// Check if the given conditional branch is based on the comparison between
@@ -1159,6 +1163,167 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
return true;
}
+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the counting instruction for the CTLZ idiom,
+///    or nullptr if there is none.
+/// 2) \p CntPhi is set to the corresponding phi node,
+///    or nullptr if there is none.
+/// 3) \p Var is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the instruction calculating the loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 == 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x = phi (x0, x.next); //PhiX
+/// cnt = phi(cnt0, cnt.next);
+///
+/// cnt.next = cnt + 1;
+/// ...
+/// x.next = x >> 1; // DefX
+/// ...
+/// } while(x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Instruction *&DefX) {
+ BasicBlock *LoopEntry;
+ Value *VarX = nullptr;
+
+ DefX = nullptr;
+ PhiX = nullptr;
+ CntInst = nullptr;
+ CntPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX = dyn_cast<Instruction>(T);
+ else
+ return false;
+
+ // step 2: detect instructions corresponding to "x.next = x >> 1"
+ if (!DefX || DefX->getOpcode() != Instruction::AShr)
+ return false;
+  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+  if (!Shft || !Shft->isOne())
+    return false;
+ VarX = DefX->getOperand(0);
+
+ // step 3: Check the recurrence of variable X
+ PhiX = dyn_cast<PHINode>(VarX);
+ if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX))
+ return false;
+
+  // step 4: Find the instruction that does the counting: cnt.next = cnt + 1
+  // TODO: We can skip this step. Once the loop trip count is known (via
+  // CTLZ), all uses of "cnt.next" could be optimized to the trip count
+  // plus "cnt0"; currently that is not done.
+ // This step could be used to detect POPCNT instruction:
+ // cnt.next = cnt + (x.next & 1)
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+ if (!Phi || Phi->getParent() != LoopEntry)
+ continue;
+
+ CntInst = Inst;
+ CntPhi = Phi;
+ break;
+ }
+ if (!CntInst)
+ return false;
+
+ return true;
+}
+
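At the source level, the detected shape is a shift-until-zero counting loop; a hypothetical C-level example of code this matches (it terminates only for x > 0, since ashr of a negative value converges to -1, not 0):

    int countShifts(int x) { // assumes x > 0
      int cnt = 0;           // cnt0 = init-val
      do {
        cnt = cnt + 1;       // CntInst: cnt.next = cnt + 1
        x = x >> 1;          // DefX:    x.next = x >> 1
      } while (x != 0);      // loop-back condition on x.next
      return cnt;            // == BitWidth(x0) - ctlz(x0) for x0 > 0
    }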
+/// Recognize a CTLZ idiom in a non-countable loop and convert the loop
+/// to a countable one (with a CTLZ-based trip count).
+/// Returns true if CTLZ was inserted as the new trip count; otherwise false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ Instruction *CntInst, *DefX;
+ PHINode *CntPhi, *PhiX;
+ if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+ return false;
+
+ bool IsCntPhiUsedOutsideLoop = false;
+ for (User *U : CntPhi->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntPhiUsedOutsideLoop = true;
+ break;
+ }
+ bool IsCntInstUsedOutsideLoop = false;
+ for (User *U : CntInst->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntInstUsedOutsideLoop = true;
+ break;
+ }
+ // If both CntInst and CntPhi are used outside the loop the profitability
+ // is questionable.
+ if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+ return false;
+
+  // On some CPUs the result of the CTLZ(X) intrinsic is undefined when X is
+  // 0. If we cannot guarantee X != 0, we need to check for this case when
+  // expanding the intrinsic.
+ bool ZeroCheck = false;
+  // It is safe to assume the preheader exists, as this was checked in the
+  // parent function runOnLoop.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+ Value *InitX = PhiX->getIncomingValueForBlock(PH);
+ // If we check X != 0 before entering the loop we don't need a zero
+ // check in CTLZ intrinsic.
+ if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+ if (BranchInst *PreCondBr =
+ dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+ if (matchCondition(PreCondBr, PH) == InitX)
+ ZeroCheck = true;
+ }
+
+  // Check if the CTLZ intrinsic is profitable. Assume it is always
+  // profitable if we delete the loop, i.e. the loop body contains only
+  // these 6 instructions:
+ // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+ // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+ // %shr = ashr %n.addr.0, 1
+ // %tobool = icmp eq %shr, 0
+ // %inc = add nsw %i.0, 1
+ // br i1 %tobool
+
+ IRBuilder<> Builder(PH->getTerminator());
+ SmallVector<const Value *, 2> Ops =
+ {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
+ ArrayRef<const Value *> Args(Ops);
+ if (CurLoop->getHeader()->size() != 6 &&
+ TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
+ TargetTransformInfo::TCC_Basic)
+ return false;
+
+ const DebugLoc DL = DefX->getDebugLoc();
+ transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+ IsCntPhiUsedOutsideLoop);
+ return true;
+}
+
/// Recognizes a population count idiom in a non-countable loop.
///
/// If detected, transforms the relevant code to issue the popcount intrinsic
@@ -1222,6 +1387,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
return CI;
}
+static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL, bool ZeroCheck) {
+ Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+/// Transform the following loop:
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// LOOP_BODY
+/// Br: loop if (DefX != 0)
+/// Use(CntPhi) or Use(CntInst)
+///
+/// Into:
+/// If CntPhi used outside the loop:
+/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
+/// Count = CountPrev + 1
+/// else
+/// Count = BitWidth(InitX) - CTLZ(InitX)
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// PhiCount = PHI [Count, Dec]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// Dec = PhiCount - 1
+/// LOOP_BODY
+/// Br: loop if (Dec != 0)
+/// Use(CountPrev + Cnt0) // Use(CntPhi)
+/// or
+/// Use(Count + Cnt0) // Use(CntInst)
+///
+/// If LOOP_BODY is empty the loop will be deleted.
+/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
+void LoopIdiomRecognize::transformLoopToCountable(
+ BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
+ const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+
+ // Step 1: Insert the CTLZ instruction at the end of the preheader block
+ // Count = BitWidth - CTLZ(InitX);
+ // If there are uses of CntPhi create:
+ // CountPrev = BitWidth - CTLZ(InitX >> 1);
+ IRBuilder<> Builder(PreheaderBr);
+ Builder.SetCurrentDebugLocation(DL);
+ Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
+
+ if (IsCntPhiUsedOutsideLoop)
+ InitXNext = Builder.CreateAShr(InitX,
+ ConstantInt::get(InitX->getType(), 1));
+ else
+ InitXNext = InitX;
+ CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
+ Count = Builder.CreateSub(
+ ConstantInt::get(CTLZ->getType(),
+ CTLZ->getType()->getIntegerBitWidth()),
+ CTLZ);
+ if (IsCntPhiUsedOutsideLoop) {
+ CountPrev = Count;
+ Count = Builder.CreateAdd(
+ CountPrev,
+ ConstantInt::get(CountPrev->getType(), 1));
+ }
+ if (IsCntPhiUsedOutsideLoop)
+ NewCount = Builder.CreateZExtOrTrunc(CountPrev,
+ cast<IntegerType>(CntInst->getType()));
+ else
+ NewCount = Builder.CreateZExtOrTrunc(Count,
+ cast<IntegerType>(CntInst->getType()));
+
+ // If the CTLZ counter's initial value is not zero, insert Add Inst.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero())
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+
+ // Step 2: Insert new IV and loop condition:
+ // loop:
+ // ...
+ // PhiCount = PHI [Count, Dec]
+ // ...
+ // Dec = PhiCount - 1
+ // ...
+ // Br: loop if (Dec != 0)
+ BasicBlock *Body = *(CurLoop->block_begin());
+ auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = Count->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(Count, Preheader);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+
+ // Step 3: All the references to the original counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctlz(x).
+ if (IsCntPhiUsedOutsideLoop)
+ CntPhi->replaceUsesOutsideBlock(NewCount, Body);
+ else
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // Step 4: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
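For reference, a minimal scalar sketch (plain C++, not part of this patch) of the
equivalence the transform relies on: for non-zero x, the recognized idiom loop
runs exactly BitWidth - CTLZ(x) times, which is what Step 1 materializes in the
preheader.

    #include <cassert>
    #include <cstdint>

    // Counts iterations of the idiom: do { ++Cnt; X >>= 1; } while (X != 0);
    static unsigned countShiftsToZero(uint32_t X) {
      unsigned Cnt = 0;
      do {
        ++Cnt;
        X >>= 1;
      } while (X != 0);
      return Cnt;
    }

    int main() {
      for (uint32_t X : {1u, 4u, 12345u, 0x80000000u})
        assert(countShiftsToZero(X) == 32u - __builtin_clz(X));
      // An all-zero X still runs the loop once, while __builtin_clz(0) is
      // undefined; that mismatch is why the transform carries a ZeroCheck flag.
    }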
+
void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 3c9850b156ac..5e0a705782ea 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -283,7 +283,6 @@ public:
// Forward propagation info
const Expression *getDefiningExpr() const { return DefiningExpr; }
- void setDefiningExpr(const Expression *E) { DefiningExpr = E; }
// Value member set
bool empty() const { return Members.empty(); }
@@ -317,6 +316,9 @@ public:
--StoreCount;
}
+ // True if this class defines no memory: it has no stores and no memory members.
+ bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
+
// Return true if two congruence classes are equivalent to each other. This
// means
// that every field but the ID number and the dead field are equivalent.
@@ -401,9 +403,12 @@ class NewGVN {
MemorySSAWalker *MSSAWalker;
const DataLayout &DL;
std::unique_ptr<PredicateInfo> PredInfo;
- BumpPtrAllocator ExpressionAllocator;
- ArrayRecycler<Value *> ArgRecycler;
- TarjanSCC SCCFinder;
+
+ // These are the only members the create* functions should have
+ // side effects on, and only because they allocate memory.
+ mutable BumpPtrAllocator ExpressionAllocator;
+ mutable ArrayRecycler<Value *> ArgRecycler;
+ mutable TarjanSCC SCCFinder;
const SimplifyQuery SQ;
// Number of function arguments, used by ranking
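A minimal sketch (hypothetical names, not from the patch) of the idiom this hunk
adopts: logically-const analysis entry points that still allocate, enabled by
marking the allocator 'mutable'.

    #include <memory>
    #include <vector>

    struct Expression { unsigned Opcode; };

    class ValueNumbering {
      // The only state the const create* helpers may touch; 'mutable' keeps
      // the query interface logically const despite the allocation.
      mutable std::vector<std::unique_ptr<Expression>> ExpressionAllocator;

    public:
      const Expression *createExpression(unsigned Opcode) const {
        ExpressionAllocator.push_back(
            std::make_unique<Expression>(Expression{Opcode}));
        return ExpressionAllocator.back().get();
      }
    };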
@@ -430,11 +435,12 @@ class NewGVN {
// In order to correctly ensure propagation, we must keep track of what
// comparisons we used, so that when the values of the comparisons change, we
// propagate the information to the places we used the comparison.
- DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers;
- // Mapping from MemoryAccess we used to the MemoryAccess we used it with. Has
+ mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
+ PredicateToUsers;
// the same reasoning as PredicateToUsers. When we skip MemoryAccesses for
// stores, we no longer can rely solely on the def-use chains of MemorySSA.
- DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> MemoryToUsers;
+ mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
+ MemoryToUsers;
// A table storing which memorydefs/phis represent a memory state provably
// equivalent to another memory state.
@@ -457,7 +463,7 @@ class NewGVN {
DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
enum PhiCycleState { PCS_Unknown, PCS_CycleFree, PCS_Cycle };
- DenseMap<const PHINode *, PhiCycleState> PhiCycleState;
+ mutable DenseMap<const PHINode *, PhiCycleState> PhiCycleState;
// Expression to class mapping.
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
ExpressionClassMap ExpressionToClass;
@@ -511,21 +517,24 @@ public:
private:
// Expression handling.
- const Expression *createExpression(Instruction *);
- const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *);
- PHIExpression *createPHIExpression(Instruction *, bool &HasBackedge,
- bool &AllConstant);
- const VariableExpression *createVariableExpression(Value *);
- const ConstantExpression *createConstantExpression(Constant *);
- const Expression *createVariableOrConstant(Value *V);
- const UnknownExpression *createUnknownExpression(Instruction *);
+ const Expression *createExpression(Instruction *) const;
+ const Expression *createBinaryExpression(unsigned, Type *, Value *,
+ Value *) const;
+ PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge,
+ bool &AllConstant) const;
+ const VariableExpression *createVariableExpression(Value *) const;
+ const ConstantExpression *createConstantExpression(Constant *) const;
+ const Expression *createVariableOrConstant(Value *V) const;
+ const UnknownExpression *createUnknownExpression(Instruction *) const;
const StoreExpression *createStoreExpression(StoreInst *,
- const MemoryAccess *);
+ const MemoryAccess *) const;
LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
- const MemoryAccess *);
- const CallExpression *createCallExpression(CallInst *, const MemoryAccess *);
- const AggregateValueExpression *createAggregateValueExpression(Instruction *);
- bool setBasicExpressionInfo(Instruction *, BasicExpression *);
+ const MemoryAccess *) const;
+ const CallExpression *createCallExpression(CallInst *,
+ const MemoryAccess *) const;
+ const AggregateValueExpression *
+ createAggregateValueExpression(Instruction *) const;
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
// Congruence class handling.
CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
@@ -560,17 +569,18 @@ private:
// Symbolic evaluation.
const Expression *checkSimplificationResults(Expression *, Instruction *,
- Value *);
- const Expression *performSymbolicEvaluation(Value *);
+ Value *) const;
+ const Expression *performSymbolicEvaluation(Value *) const;
const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
- Instruction *, MemoryAccess *);
- const Expression *performSymbolicLoadEvaluation(Instruction *);
- const Expression *performSymbolicStoreEvaluation(Instruction *);
- const Expression *performSymbolicCallEvaluation(Instruction *);
- const Expression *performSymbolicPHIEvaluation(Instruction *);
- const Expression *performSymbolicAggrValueEvaluation(Instruction *);
- const Expression *performSymbolicCmpEvaluation(Instruction *);
- const Expression *performSymbolicPredicateInfoEvaluation(Instruction *);
+ Instruction *,
+ MemoryAccess *) const;
+ const Expression *performSymbolicLoadEvaluation(Instruction *) const;
+ const Expression *performSymbolicStoreEvaluation(Instruction *) const;
+ const Expression *performSymbolicCallEvaluation(Instruction *) const;
+ const Expression *performSymbolicPHIEvaluation(Instruction *) const;
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
+ const Expression *performSymbolicCmpEvaluation(Instruction *) const;
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
// Congruence finding.
bool someEquivalentDominates(const Instruction *, const Instruction *) const;
@@ -620,8 +630,8 @@ private:
void markPredicateUsersTouched(Instruction *);
void markValueLeaderChangeTouched(CongruenceClass *CC);
void markMemoryLeaderChangeTouched(CongruenceClass *CC);
- void addPredicateUsers(const PredicateBase *, Instruction *);
- void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U);
+ void addPredicateUsers(const PredicateBase *, Instruction *) const;
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
// Main loop of value numbering
void iterateTouchedInstructions();
@@ -634,7 +644,7 @@ private:
void verifyIterationSettled(Function &F);
bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
BasicBlock *getBlockForValue(Value *V) const;
- void deleteExpression(const Expression *E);
+ void deleteExpression(const Expression *E) const;
unsigned InstrToDFSNum(const Value *V) const {
assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
return InstrDFS.lookup(V);
@@ -654,7 +664,7 @@ private:
? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
: InstrDFS.lookup(MA);
}
- bool isCycleFree(const PHINode *PN);
+ bool isCycleFree(const PHINode *PN) const;
template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
// Debug counter info. When verifying, we have to reset the value numbering
// debug counter to the same state it started in to get the same results.
@@ -702,7 +712,7 @@ BasicBlock *NewGVN::getBlockForValue(Value *V) const {
// Delete a definitely dead expression, so it can be reused by the expression
// allocator. Some of these are not in creation functions, so we have to accept
// const versions.
-void NewGVN::deleteExpression(const Expression *E) {
+void NewGVN::deleteExpression(const Expression *E) const {
assert(isa<BasicExpression>(E));
auto *BE = cast<BasicExpression>(E);
const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
@@ -710,7 +720,7 @@ void NewGVN::deleteExpression(const Expression *E) {
}
PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
- bool &AllConstant) {
+ bool &AllConstant) const {
BasicBlock *PHIBlock = I->getParent();
auto *PN = cast<PHINode>(I);
auto *E =
@@ -722,30 +732,46 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
unsigned PHIRPO = RPOOrdering.lookup(DT->getNode(PHIBlock));
+ // NewGVN assumes the operands of a PHI node are in a consistent order across
+ // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix
+ // this in LLVM at some point, we don't want GVN to find wrong congruences.
+ // Therefore, here we sort uses in predecessor order.
+ // We're sorting the values by pointer. In theory this might be a cause of
+ // non-determinism, but here we don't rely on the ordering for anything
+ // significant, e.g. we don't create new instructions based on it, so we're
+ // fine.
+ SmallVector<const Use *, 4> PHIOperands;
+ for (const Use &U : PN->operands())
+ PHIOperands.push_back(&U);
+ std::sort(PHIOperands.begin(), PHIOperands.end(),
+ [&](const Use *U1, const Use *U2) {
+ return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2);
+ });
+
// Filter out unreachable phi operands.
- auto Filtered = make_filter_range(PN->operands(), [&](const Use &U) {
- return ReachableEdges.count({PN->getIncomingBlock(U), PHIBlock});
+ auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) {
+ return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock});
});
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
- [&](const Use &U) -> Value * {
- auto *BB = PN->getIncomingBlock(U);
+ [&](const Use *U) -> Value * {
+ auto *BB = PN->getIncomingBlock(*U);
auto *DTN = DT->getNode(BB);
if (RPOOrdering.lookup(DTN) >= PHIRPO)
HasBackedge = true;
- AllConstant &= isa<UndefValue>(U) || isa<Constant>(U);
+ AllConstant &= isa<UndefValue>(*U) || isa<Constant>(*U);
// Don't try to transform self-defined phis.
- if (U == PN)
+ if (*U == PN)
return PN;
- return lookupOperandLeader(U);
+ return lookupOperandLeader(*U);
});
return E;
}
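The predecessor-order sort above can be pictured on plain data; a hedged sketch
(stub types, not LLVM's) of canonicalizing PHI operands by incoming-block
address, as the lambda above does:

    #include <algorithm>
    #include <vector>

    struct BasicBlockStub {};
    struct IncomingStub { const BasicBlockStub *Block; int Value; };

    // Pointer order is arbitrary but stable within a run, which is all the
    // operand-by-operand congruence comparison needs.
    void canonicalizeOperands(std::vector<IncomingStub> &Ops) {
      std::sort(Ops.begin(), Ops.end(),
                [](const IncomingStub &A, const IncomingStub &B) {
                  return A.Block < B.Block;
                });
    }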
// Set basic expression info (Arguments, type, opcode) for Expression
// E from Instruction I in block B.
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
bool AllConstant = true;
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
E->setType(GEP->getSourceElementType());
@@ -766,7 +792,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {
}
const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
- Value *Arg1, Value *Arg2) {
+ Value *Arg1,
+ Value *Arg2) const {
auto *E = new (ExpressionAllocator) BasicExpression(2);
E->setType(T);
@@ -795,7 +822,8 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
// TODO: Once finished, this should not take an Instruction, we only
// use it for printing.
const Expression *NewGVN::checkSimplificationResults(Expression *E,
- Instruction *I, Value *V) {
+ Instruction *I,
+ Value *V) const {
if (!V)
return nullptr;
if (auto *C = dyn_cast<Constant>(V)) {
@@ -827,7 +855,7 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
return nullptr;
}
-const Expression *NewGVN::createExpression(Instruction *I) {
+const Expression *NewGVN::createExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
bool AllConstant = setBasicExpressionInfo(I, E);
@@ -913,7 +941,7 @@ const Expression *NewGVN::createExpression(Instruction *I) {
}
const AggregateValueExpression *
-NewGVN::createAggregateValueExpression(Instruction *I) {
+NewGVN::createAggregateValueExpression(Instruction *I) const {
if (auto *II = dyn_cast<InsertValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
@@ -932,32 +960,32 @@ NewGVN::createAggregateValueExpression(Instruction *I) {
llvm_unreachable("Unhandled type of aggregate value operation");
}
-const VariableExpression *NewGVN::createVariableExpression(Value *V) {
+const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
auto *E = new (ExpressionAllocator) VariableExpression(V);
E->setOpcode(V->getValueID());
return E;
}
-const Expression *NewGVN::createVariableOrConstant(Value *V) {
+const Expression *NewGVN::createVariableOrConstant(Value *V) const {
if (auto *C = dyn_cast<Constant>(V))
return createConstantExpression(C);
return createVariableExpression(V);
}
-const ConstantExpression *NewGVN::createConstantExpression(Constant *C) {
+const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
auto *E = new (ExpressionAllocator) ConstantExpression(C);
E->setOpcode(C->getValueID());
return E;
}
-const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) {
+const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) UnknownExpression(I);
E->setOpcode(I->getOpcode());
return E;
}
-const CallExpression *NewGVN::createCallExpression(CallInst *CI,
- const MemoryAccess *MA) {
+const CallExpression *
+NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
// FIXME: Add operand bundles for calls.
auto *E =
new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
@@ -1017,9 +1045,8 @@ Value *NewGVN::lookupOperandLeader(Value *V) const {
const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
auto *CC = getMemoryClass(MA);
assert(CC->getMemoryLeader() &&
- "Every MemoryAccess should be mapped to a "
- "congruence class with a represenative memory "
- "access");
+ "Every MemoryAccess should be mapped to a congruence class with a "
+ "representative memory access");
return CC->getMemoryLeader();
}
@@ -1032,7 +1059,7 @@ bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const {
LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
LoadInst *LI,
- const MemoryAccess *MA) {
+ const MemoryAccess *MA) const {
auto *E =
new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
E->allocateOperands(ArgRecycler, ExpressionAllocator);
@@ -1050,8 +1077,8 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
return E;
}
-const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
- const MemoryAccess *MA) {
+const StoreExpression *
+NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
auto *E = new (ExpressionAllocator)
StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
@@ -1068,7 +1095,7 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
return E;
}
-const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
// Unlike loads, we never try to eliminate stores, so we do not check if they
// are simple and avoid value numbering them.
auto *SI = cast<StoreInst>(I);
@@ -1126,7 +1153,7 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {
const Expression *
NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
LoadInst *LI, Instruction *DepInst,
- MemoryAccess *DefiningAccess) {
+ MemoryAccess *DefiningAccess) const {
assert((!LI || LI->isSimple()) && "Not a simple load");
if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
// Can't forward from non-atomic to atomic without violating memory model.
@@ -1201,7 +1228,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
return nullptr;
}
-const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
auto *LI = cast<LoadInst>(I);
// We can eliminate in favor of non-simple loads, but we won't be able to
@@ -1239,7 +1266,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {
}
const Expression *
-NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
auto *PI = PredInfo->getPredicateInfoFor(I);
if (!PI)
return nullptr;
@@ -1284,7 +1311,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
return nullptr;
if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
- DEBUG(dbgs() << "Copy is not of any condition operands!");
+ DEBUG(dbgs() << "Copy is not of any condition operands!\n");
return nullptr;
}
Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
@@ -1329,7 +1356,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
}
// Evaluate read only and pure calls, and create an expression result.
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
auto *CI = cast<CallInst>(I);
if (auto *II = dyn_cast<IntrinsicInst>(I)) {
// Intrinsics with the returned attribute are copies of arguments.
@@ -1366,8 +1393,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
DEBUG(dbgs() << "Setting " << *From);
DEBUG(dbgs() << " equivalent to congruence class ");
DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
- DEBUG(dbgs() << *NewClass->getMemoryLeader());
- DEBUG(dbgs() << "\n");
+ DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
auto LookupResult = MemoryAccessToClass.find(From);
bool Changed = false;
@@ -1381,7 +1407,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
NewClass->memory_insert(MP);
// This may have killed the class if it had no non-memory members
if (OldClass->getMemoryLeader() == From) {
- if (OldClass->memory_empty()) {
+ if (OldClass->definesNoMemory()) {
OldClass->setMemoryLeader(nullptr);
} else {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
@@ -1406,7 +1432,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From,
// Determine if a phi is cycle-free. That means the values in the phi don't
// depend on any expressions that can change value as a result of the phi.
// For example, a non-cycle free phi would be v = phi(0, v+1).
-bool NewGVN::isCycleFree(const PHINode *PN) {
+bool NewGVN::isCycleFree(const PHINode *PN) const {
// In order to compute cycle-freeness, we do SCC finding on the phi, and see
// what kind of SCC it ends up in. If it is a singleton, it is cycle-free.
// If it is not in a singleton, it is only cycle free if the other members are
@@ -1436,7 +1462,7 @@ bool NewGVN::isCycleFree(const PHINode *PN) {
}
// Evaluate PHI nodes symbolically, and create an expression result.
-const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
// True if one of the incoming phi edges is a backedge.
bool HasBackedge = false;
// All constant tracks the state of whether all the *original* phi operands
@@ -1510,7 +1536,8 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {
return E;
}
-const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {
+const Expression *
+NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
@@ -1548,7 +1575,7 @@ const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {
return createAggregateValueExpression(I);
}
-const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
auto *CI = dyn_cast<CmpInst>(I);
// See if our operands are equal to those of a previous predicate, and if so,
// if it implies true or false.
@@ -1663,7 +1690,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {
}
// Substitute and symbolize the value before value numbering.
-const Expression *NewGVN::performSymbolicEvaluation(Value *V) {
+const Expression *NewGVN::performSymbolicEvaluation(Value *V) const {
const Expression *E = nullptr;
if (auto *C = dyn_cast<Constant>(V))
E = createConstantExpression(C);
@@ -1749,7 +1776,7 @@ void NewGVN::markUsersTouched(Value *V) {
}
}
-void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) {
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
MemoryToUsers[To].insert(U);
}
@@ -1772,7 +1799,7 @@ void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
}
// Add I to the set of users of a given predicate.
-void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) {
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
PredicateToUsers[PBranch->Condition].insert(I);
else if (auto *PAssume = dyn_cast<PredicateBranch>(PB))
@@ -1825,8 +1852,7 @@ const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
// TODO: If this ends up too slow, we can maintain a next memory leader like we
// do for regular leaders.
// Make sure there will be a leader to find
- assert((CC->getStoreCount() > 0 || !CC->memory_empty()) &&
- "Can't get next leader if there is none");
+ assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
if (CC->getStoreCount() > 0) {
if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
return MSSA->getMemoryAccess(NL);
@@ -1898,7 +1924,7 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
setMemoryClass(InstMA, NewClass);
// Now, fixup the old class if necessary
if (OldClass->getMemoryLeader() == InstMA) {
- if (OldClass->getStoreCount() != 0 || !OldClass->memory_empty()) {
+ if (!OldClass->definesNoMemory()) {
OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
DEBUG(dbgs() << "Memory class leader change for class "
<< OldClass->getID() << " to "
@@ -1956,10 +1982,9 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
// If it's a store expression we are using, it means we are not equivalent
// to something earlier.
- if (isa<StoreExpression>(E)) {
- assert(lookupOperandLeader(SI->getValueOperand()) !=
- NewClass->getLeader());
- NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ if (auto *SE = dyn_cast<StoreExpression>(E)) {
+ assert(SE->getStoredValue() != NewClass->getLeader());
+ NewClass->setStoredValue(SE->getStoredValue());
markValueLeaderChangeTouched(NewClass);
// Shift the new class leader to be the store
DEBUG(dbgs() << "Changing leader of congruence class "
@@ -1985,7 +2010,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// See if we destroyed the class or need to swap leaders.
if (OldClass->empty() && OldClass != TOPClass) {
if (OldClass->getDefiningExpr()) {
- DEBUG(dbgs() << "Erasing expression " << OldClass->getDefiningExpr()
+ DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
<< " from table\n");
ExpressionToClass.erase(OldClass->getDefiningExpr());
}
@@ -2064,7 +2089,7 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
} else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
StoreInst *SI = SE->getStoreInst();
NewClass->setLeader(SI);
- NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ NewClass->setStoredValue(SE->getStoredValue());
// The RepMemoryAccess field will be filled in properly by the
// moveValueToNewCongruenceClass call.
} else {
@@ -2523,6 +2548,19 @@ void NewGVN::verifyMemoryCongruency() const {
return false;
if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+
+ // We could have phi nodes whose operands are all trivially dead,
+ // so we don't process them.
+ if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
+ for (auto &U : MemPHI->incoming_values()) {
+ if (Instruction *I = dyn_cast<Instruction>(U.get())) {
+ if (!isInstructionTriviallyDead(I))
+ return true;
+ }
+ }
+ return false;
+ }
+
return true;
};
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index fb1b47c48276..4f608c97147d 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -55,7 +55,7 @@ static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
/// Update the dominator tree after removing one exiting predecessor of a loop
/// exit block.
static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L,
- DominatorTree &DT) {
+ DominatorTree &DT) {
assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) &&
"Cannot have empty predecessors of the loop exit block if we split "
"off a block to unswitch!");
@@ -137,6 +137,98 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
}
}
+/// Check that all the LCSSA PHI nodes in the loop exit block have trivial
+/// incoming values along this edge.
+static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
+ BasicBlock &ExitBB) {
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ return true;
+
+ // If the incoming value for this edge isn't loop invariant the unswitch
+ // won't be trivial.
+ if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
+ return false;
+ }
+ llvm_unreachable("Basic blocks should never be empty!");
+}
+
+/// Rewrite the PHI nodes in an unswitched loop exit basic block.
+///
+/// Requires that the loop exit and unswitched basic block are the same, and
+/// that the exiting block was a unique predecessor of that block. Rewrites the
+/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
+/// PHI nodes from the old preheader that now contains the unswitched
+/// terminator.
+static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ for (Instruction &I : UnswitchedBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ // When the loop exit is directly unswitched, we just need to update the
+ // incoming basic block. We loop to handle weird cases with repeated
+ // incoming blocks, but typically expect only one operand here.
+ for (auto i : llvm::seq<int>(0, PN->getNumOperands())) {
+ assert(PN->getIncomingBlock(i) == &OldExitingBB &&
+ "Found incoming block different from unique predecessor!");
+ PN->setIncomingBlock(i, &OldPH);
+ }
+ }
+}
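In miniature (plain structs, not LLVM types), the rewrite above just retargets
matching incoming edges:

    #include <vector>

    struct BlockStub {};
    struct PhiEntryStub { BlockStub *IncomingBlock; int IncomingValue; };

    // Point every entry that named the old exiting block at the old preheader;
    // the incoming values themselves are untouched.
    void retargetIncoming(std::vector<PhiEntryStub> &Phi, BlockStub *OldExiting,
                          BlockStub *OldPH) {
      for (PhiEntryStub &E : Phi)
        if (E.IncomingBlock == OldExiting)
          E.IncomingBlock = OldPH;
    }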
+
+/// Rewrite the PHI nodes in the loop exit basic block and the split off
+/// unswitched block.
+///
+/// Because the exit block remains an exit from the loop, this rewrites the
+/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
+/// nodes into the unswitched basic block to select between the value in the
+/// old preheader and the loop exit.
+static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
+ BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ assert(&ExitBB != &UnswitchedBB &&
+ "Must have different loop exit and unswitched blocks!");
+ Instruction *InsertPt = &*UnswitchedBB.begin();
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2,
+ PN->getName() + ".split", InsertPt);
+
+ // Walk backwards over the old PHI node's inputs to minimize the cost of
+ // removing each one. We have to do this weird loop manually so that we
+ // create the same number of new incoming edges in the new PHI as we expect
+ // each case-based edge to be included in the unswitched switch in some
+ // cases.
+ // FIXME: This is really, really gross. It would be much cleaner if LLVM
+ // allowed us to create a single entry for a predecessor block without
+ // having separate entries for each "edge" even though these edges are
+ // required to produce identical results.
+ for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
+ if (PN->getIncomingBlock(i) != &OldExitingBB)
+ continue;
+
+ Value *Incoming = PN->removeIncomingValue(i);
+ NewPN->addIncoming(Incoming, &OldPH);
+ }
+
+ // Now replace the old PHI with the new one and wire the old one in as an
+ // input to the new one.
+ PN->replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(PN, &ExitBB);
+ }
+}
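A plain-struct model (stub types, not LLVM's) of the split variant: entries for
the unswitched edge migrate from the exit PHI into a new PHI, and the exit PHI
then feeds the new one, mirroring the backwards walk above.

    #include <utility>
    #include <vector>

    struct BlockStub {};
    // Each PHI is modeled as a list of (incoming block, incoming value) pairs.
    using PhiStub = std::vector<std::pair<BlockStub *, int>>;

    // Move entries coming from OldExiting into NewPN, retargeted to OldPH.
    // Walk backwards so each erase stays cheap, as in the code above.
    void splitPhi(PhiStub &PN, PhiStub &NewPN, BlockStub *OldExiting,
                  BlockStub *OldPH) {
      for (int i = static_cast<int>(PN.size()) - 1; i >= 0; --i) {
        if (PN[i].first != OldExiting)
          continue;
        NewPN.push_back({OldPH, PN[i].second});
        PN.erase(PN.begin() + i);
      }
    }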
+
/// Unswitch a trivial branch if the condition is loop invariant.
///
/// This routine should only be called when loop code leading to the branch has
@@ -187,10 +279,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
assert(L.contains(ContinueBB) &&
"Cannot have both successors exit and still be in the loop!");
- // If the loop exit block contains phi nodes, this isn't trivial.
- // FIXME: We should examine the PHI to determine whether or not we can handle
- // it trivially.
- if (isa<PHINode>(LoopExitBB->begin()))
+ auto *ParentBB = BI.getParent();
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
return false;
DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal
@@ -209,14 +299,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
BasicBlock *UnswitchedBB;
if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
(void)PredBB;
- assert(PredBB == BI.getParent() && "A branch's parent is't a predecessor!");
+ assert(PredBB == BI.getParent() &&
+ "A branch's parent isn't a predecessor!");
UnswitchedBB = LoopExitBB;
} else {
UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
}
- BasicBlock *ParentBB = BI.getParent();
-
// Now splice the branch to gate reaching the new preheader and re-point its
// successors.
OldPH->getInstList().splice(std::prev(OldPH->end()),
@@ -229,6 +318,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
// terminator.
BranchInst::Create(ContinueBB, ParentBB);
+ // Rewrite the relevant PHI nodes.
+ if (UnswitchedBB == LoopExitBB)
+ rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
+ else
+ rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
+ *ParentBB, *OldPH);
+
// Now we need to update the dominator tree.
updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
// But if we split something off of the loop exit block then we also removed
@@ -278,6 +374,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (!L.isLoopInvariant(LoopCond))
return false;
+ auto *ParentBB = SI.getParent();
+
// FIXME: We should compute this once at the start and update it!
SmallVector<BasicBlock *, 16> ExitBlocks;
L.getExitBlocks(ExitBlocks);
@@ -287,12 +385,13 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
SmallVector<int, 4> ExitCaseIndices;
for (auto Case : SI.cases()) {
auto *SuccBB = Case.getCaseSuccessor();
- if (ExitBlockSet.count(SuccBB) && !isa<PHINode>(SuccBB->begin()))
+ if (ExitBlockSet.count(SuccBB) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
ExitCaseIndices.push_back(Case.getCaseIndex());
}
BasicBlock *DefaultExitBB = nullptr;
if (ExitBlockSet.count(SI.getDefaultDest()) &&
- !isa<PHINode>(SI.getDefaultDest()->begin()) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
!isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
DefaultExitBB = SI.getDefaultDest();
else if (ExitCaseIndices.empty())
@@ -330,7 +429,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (CommonSuccBB) {
SI.setDefaultDest(CommonSuccBB);
} else {
- BasicBlock *ParentBB = SI.getParent();
BasicBlock *UnreachableBB = BasicBlock::Create(
ParentBB->getContext(),
Twine(ParentBB->getName()) + ".unreachable_default",
@@ -358,30 +456,44 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
// Now add the unswitched switch.
auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
- // Split any exit blocks with remaining in-loop predecessors. We walk in
- // reverse so that we split in the same order as the cases appeared. This is
- // purely for convenience of reading the resulting IR, but it doesn't cost
- // anything really.
+ // Rewrite the IR for the unswitched basic blocks. This requires two steps.
+ // First, we split any exit blocks with remaining in-loop predecessors. Then
+ // we update the PHIs in one of two ways depending on if there was a split.
+ // We walk in reverse so that we split in the same order as the cases
+ // appeared. This is purely for convenience of reading the resulting IR, but
+ // it doesn't cost anything really.
+ SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
// Handle the default exit if necessary.
// FIXME: It'd be great if we could merge this with the loop below but LLVM's
// ranges aren't quite powerful enough yet.
- if (DefaultExitBB && !pred_empty(DefaultExitBB)) {
- auto *SplitBB =
- SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
- updateLoopExitIDom(DefaultExitBB, L, DT);
- DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ if (DefaultExitBB) {
+ if (pred_empty(DefaultExitBB)) {
+ UnswitchedExitBBs.insert(DefaultExitBB);
+ rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
+ } else {
+ auto *SplitBB =
+ SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+ *ParentBB, *OldPH);
+ updateLoopExitIDom(DefaultExitBB, L, DT);
+ DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ }
}
// Note that we must use a reference in the for loop so that we update the
// container.
for (auto &CasePair : reverse(ExitCases)) {
// Grab a reference to the exit block in the pair so that we can update it.
- BasicBlock *&ExitBB = CasePair.second;
+ BasicBlock *ExitBB = CasePair.second;
// If this case is the last edge into the exit block, we can simply reuse it
// as it will no longer be a loop exit. No mapping necessary.
- if (pred_empty(ExitBB))
+ if (pred_empty(ExitBB)) {
+ // Only rewrite once.
+ if (UnswitchedExitBBs.insert(ExitBB).second)
+ rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
continue;
+ }
// Otherwise we need to split the exit block so that we retain an exit
// block from the loop and a target for the unswitched condition.
@@ -389,9 +501,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
if (!SplitExitBB) {
// If this is the first time we see this, do the split and remember it.
SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+ *ParentBB, *OldPH);
updateLoopExitIDom(ExitBB, L, DT);
}
- ExitBB = SplitExitBB;
+ // Update the case pair to point to the split block.
+ CasePair.second = SplitExitBB;
}
// Now add the unswitched cases. We do this in reverse order as we built them
diff --git a/lib/Transforms/Scalar/SpeculativeExecution.cpp b/lib/Transforms/Scalar/SpeculativeExecution.cpp
index a0fc966cee2c..a7c308b59877 100644
--- a/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -208,6 +208,47 @@ bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
return false;
}
+static unsigned ComputeSpeculationCost(const Instruction *I,
+ const TargetTransformInfo &TTI) {
+ switch (Operator::getOpcode(I)) {
+ case Instruction::GetElementPtr:
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Select:
+ case Instruction::Shl:
+ case Instruction::Sub:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::Call:
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return TTI.getUserCost(I);
+
+ default:
+ return UINT_MAX; // Disallow anything not whitelisted.
+ }
+}
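The whitelist pattern in isolation (hedged sketch; the enum and costs are
stand-ins for LLVM opcodes and TTI.getUserCost):

    #include <climits>

    enum class OpKind { Add, Mul, Load, Store, Other };

    // Anything not explicitly whitelisted reports an infinite cost, so the
    // hoisting loop below will refuse to speculate it.
    unsigned speculationCost(OpKind K) {
      switch (K) {
      case OpKind::Add:
      case OpKind::Mul:
        return 1; // stand-in for TTI.getUserCost(I)
      default:
        return UINT_MAX;
      }
    }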
+
bool SpeculativeExecutionPass::considerHoistingFromTo(
BasicBlock &FromBlock, BasicBlock &ToBlock) {
SmallSet<const Instruction *, 8> NotHoisted;
@@ -223,7 +264,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
unsigned TotalSpeculationCost = 0;
for (auto& I : FromBlock) {
- const unsigned Cost = TTI->getUserCost(&I);
+ const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
AllPrecedingUsesFromBlockHoisted(&I)) {
TotalSpeculationCost += Cost;
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index 7ffdad597a9b..83ec7f55d1af 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -261,10 +261,10 @@ ValueRange FastDivInsertionTask::getValueRange(Value *V,
computeKnownBits(V, Known, DL);
- if (Known.Zero.countLeadingOnes() >= HiBits)
+ if (Known.countMinLeadingZeros() >= HiBits)
return VALRNG_KNOWN_SHORT;
- if (Known.One.countLeadingZeros() < HiBits)
+ if (Known.countMaxLeadingZeros() < HiBits)
return VALRNG_LIKELY_LONG;
// Long integer divisions are often used in hashtable implementations. It's
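The rename above is an identity: bits set in Known.Zero are bits proven zero in
the value, so the leading ones of that mask count the value's guaranteed leading
zeros. A quick check (plain C++, hypothetical helper):

    #include <cassert>
    #include <cstdint>

    static unsigned leadingOnes(uint32_t Mask) {
      unsigned N = 0;
      while (N < 32 && (Mask & 0x80000000u)) {
        Mask <<= 1;
        ++N;
      }
      return N;
    }

    int main() {
      uint32_t KnownZero = 0xFFFF0000u; // top 16 bits proven zero
      assert(leadingOnes(KnownZero) == 16u); // == countMinLeadingZeros()
    }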
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index d5124ac89016..4aa26fd14fee 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -41,6 +41,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
ValueToValueMapTy &VMap,
const Twine &NameSuffix, Function *F,
ClonedCodeInfo *CodeInfo) {
+ DenseMap<const MDNode *, MDNode *> Cache;
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
@@ -50,6 +51,9 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
II != IE; ++II) {
Instruction *NewInst = II->clone();
+ if (F && F->getSubprogram())
+ DebugLoc::reparentDebugInfo(*NewInst, BB->getParent()->getSubprogram(),
+ F->getSubprogram(), Cache);
if (II->hasName())
NewInst->setName(II->getName()+NameSuffix);
NewBB->getInstList().push_back(NewInst);
@@ -120,12 +124,28 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
OldFunc->getAllMetadata(MDs);
- for (auto MD : MDs)
- NewFunc->addMetadata(
- MD.first,
- *MapMetadata(MD.second, VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
- TypeMapper, Materializer));
+ for (auto MD : MDs) {
+ MDNode *NewMD;
+ bool MustCloneSP =
+ (MD.first == LLVMContext::MD_dbg && OldFunc->getParent() &&
+ OldFunc->getParent() == NewFunc->getParent());
+ if (MustCloneSP) {
+ auto *SP = cast<DISubprogram>(MD.second);
+ NewMD = DISubprogram::getDistinct(
+ NewFunc->getContext(), SP->getScope(), SP->getName(),
+ NewFunc->getName(), SP->getFile(), SP->getLine(), SP->getType(),
+ SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(),
+ SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(),
+ SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(),
+ SP->getUnit(), SP->getTemplateParams(), SP->getDeclaration(),
+ SP->getVariables(), SP->getThrownTypes());
+ } else
+ NewMD =
+ MapMetadata(MD.second, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer);
+ NewFunc->addMetadata(MD.first, *NewMD);
+ }
// Loop over all of the basic blocks in the function, cloning them as
// appropriate. Note that we save BE this way in order to handle cloning of
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index 4e9d67252d6c..5444b752de82 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -96,7 +96,7 @@ std::unique_ptr<Module> llvm::CloneModule(
else
GV = new GlobalVariable(
*New, I->getValueType(), false, GlobalValue::ExternalLinkage,
- (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr,
+ nullptr, I->getName(), nullptr,
I->getThreadLocalMode(), I->getType()->getAddressSpace());
VMap[&*I] = GV;
// We do not copy attributes (mainly because copying between different
diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp
index 8c2386554da5..78d7474e5b95 100644
--- a/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -67,8 +67,7 @@ IRBuilder<> *EscapeEnumerator::Next() {
// Create a cleanup block.
LLVMContext &C = F.getContext();
BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
- Type *ExnTy =
- StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
+ Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
if (!F.hasPersonalityFn()) {
Constant *PersFn = getDefaultPersonalityFn(F.getParent());
F.setPersonalityFn(PersFn);
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 6d56e08af99f..9cb4762b683c 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -1302,41 +1302,6 @@ static bool hasLifetimeMarkers(AllocaInst *AI) {
return false;
}
-/// Rebuild the entire inlined-at chain for this instruction so that the top of
-/// the chain now is inlined-at the new call site.
-static DebugLoc
-updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode,
- LLVMContext &Ctx,
- DenseMap<const DILocation *, DILocation *> &IANodes) {
- SmallVector<DILocation *, 3> InlinedAtLocations;
- DILocation *Last = InlinedAtNode;
- DILocation *CurInlinedAt = DL;
-
- // Gather all the inlined-at nodes
- while (DILocation *IA = CurInlinedAt->getInlinedAt()) {
- // Skip any we've already built nodes for
- if (DILocation *Found = IANodes[IA]) {
- Last = Found;
- break;
- }
-
- InlinedAtLocations.push_back(IA);
- CurInlinedAt = IA;
- }
-
- // Starting from the top, rebuild the nodes to point to the new inlined-at
- // location (then rebuilding the rest of the chain behind it) and update the
- // map of already-constructed inlined-at nodes.
- for (const DILocation *MD : reverse(InlinedAtLocations)) {
- Last = IANodes[MD] = DILocation::getDistinct(
- Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
- }
-
- // And finally create the normal location for this instruction, referring to
- // the new inlined-at chain.
- return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last);
-}
-
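A plain-struct model (not the DebugLoc API) of the cached chain rebuild that the
replacement, DebugLoc::appendInlinedAt, performs for the caller below: walk to
the root, then rebuild top-down onto the new root, reusing nodes already built.

    #include <deque>
    #include <map>

    struct LocStub { const LocStub *InlinedAt; int Line; };

    // Rebuild the inlined-at chain of L onto NewRoot, memoizing rebuilt nodes
    // (mirrors the IANodes cache kept by fixupLineNumbers). std::deque keeps
    // stored node addresses stable across push_back.
    const LocStub *reparent(const LocStub *L, const LocStub *NewRoot,
                            std::map<const LocStub *, const LocStub *> &Cache,
                            std::deque<LocStub> &Storage) {
      if (!L)
        return NewRoot;
      auto It = Cache.find(L);
      if (It != Cache.end())
        return It->second;
      const LocStub *Parent = reparent(L->InlinedAt, NewRoot, Cache, Storage);
      Storage.push_back(LocStub{Parent, L->Line});
      return Cache[L] = &Storage.back();
    }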
/// Return the result of AI->isStaticAlloca() if AI were moved to the entry
/// block. Allocas used in inalloca calls and allocas of dynamic array size
/// cannot be static.
@@ -1364,14 +1329,16 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
// Cache the inlined-at nodes as they're built so they are reused, without
// this every instruction's inlined-at chain would become distinct from each
// other.
- DenseMap<const DILocation *, DILocation *> IANodes;
+ DenseMap<const MDNode *, MDNode *> IANodes;
for (; FI != Fn->end(); ++FI) {
for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
BI != BE; ++BI) {
if (DebugLoc DL = BI->getDebugLoc()) {
- BI->setDebugLoc(
- updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes));
+ auto IA = DebugLoc::appendInlinedAt(DL, InlinedAtNode, BI->getContext(),
+ IANodes);
+ auto IDL = DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), IA);
+ BI->setDebugLoc(IDL);
continue;
}
@@ -1429,11 +1396,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
/// Update the branch metadata for cloned call instructions.
static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
const Optional<uint64_t> &CalleeEntryCount,
- const Instruction *TheCall) {
+ const Instruction *TheCall,
+ ProfileSummaryInfo *PSI) {
if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)
return;
Optional<uint64_t> CallSiteCount =
- ProfileSummaryInfo::getProfileCount(TheCall, nullptr);
+ PSI ? PSI->getProfileCount(TheCall, nullptr) : None;
uint64_t CallCount =
std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,
CalleeEntryCount.getValue());
@@ -1456,16 +1424,16 @@ static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
/// The callsite's block count is subtracted from the callee's function entry
/// count.
static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,
- Instruction *CallInst, Function *Callee) {
+ Instruction *CallInst, Function *Callee,
+ ProfileSummaryInfo *PSI) {
// If the callee has an original count of N, and the estimated count of the
// callsite is M, the new callee count is set to N - M. M is estimated from
// the caller's entry count, its entry block frequency and the block frequency
// of the callsite.
Optional<uint64_t> CalleeCount = Callee->getEntryCount();
- if (!CalleeCount.hasValue())
+ if (!CalleeCount.hasValue() || !PSI)
return;
- Optional<uint64_t> CallCount =
- ProfileSummaryInfo::getProfileCount(CallInst, CallerBFI);
+ Optional<uint64_t> CallCount = PSI->getProfileCount(CallInst, CallerBFI);
if (!CallCount.hasValue())
return;
// Since CallSiteCount is an estimate, it could exceed the original callee
@@ -1668,9 +1636,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
CalledFunc->front());
- updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall);
+ updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall,
+ IFI.PSI);
// Update the profile count of callee.
- updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc);
+ updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI);
// Inject byval arguments initialization.
for (std::pair<Value*, Value*> &Init : ByValInit)
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp
index 8a1973d1db05..53b432fcafd4 100644
--- a/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/lib/Transforms/Utils/InstructionNamer.cpp
@@ -26,16 +26,15 @@ namespace {
InstNamer() : FunctionPass(ID) {
initializeInstNamerPass(*PassRegistry::getPassRegistry());
}
-
+
void getAnalysisUsage(AnalysisUsage &Info) const override {
Info.setPreservesAll();
}
bool runOnFunction(Function &F) override {
- for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
- AI != AE; ++AI)
- if (!AI->hasName() && !AI->getType()->isVoidTy())
- AI->setName("arg");
+ for (auto &Arg : F.args())
+ if (!Arg.hasName())
+ Arg.setName("arg");
for (BasicBlock &BB : F) {
if (!BB.hasName())
@@ -48,11 +47,11 @@ namespace {
return true;
}
};
-
+
char InstNamer::ID = 0;
}
-INITIALIZE_PASS(InstNamer, "instnamer",
+INITIALIZE_PASS(InstNamer, "instnamer",
"Assign names to anonymous instructions", false, false)
char &llvm::InstructionNamerID = InstNamer::ID;
//===----------------------------------------------------------------------===//
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index ce6b703f3528..1ca509472b5f 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1041,7 +1041,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
KnownBits Known(BitWidth);
computeKnownBits(V, Known, DL, 0, AC, CxtI, DT);
- unsigned TrailZ = Known.Zero.countTrailingOnes();
+ unsigned TrailZ = Known.countMinTrailingZeros();
// Avoid trouble with ridiculously large TrailZ values, such as
// those computed from a null pointer.
@@ -1105,8 +1105,9 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
StoreInst *SI, DIBuilder &Builder) {
auto *DIVar = DDI->getVariable();
- auto *DIExpr = DDI->getExpression();
assert(DIVar && "Missing variable");
+ auto *DIExpr = DDI->getExpression();
+ Value *DV = SI->getOperand(0);
// If an argument is zero extended then use argument directly. The ZExt
// may be zapped by an optimization pass in future.
@@ -1116,34 +1117,28 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
if (ExtendedArg) {
- // We're now only describing a subset of the variable. The fragment we're
- // describing will always be smaller than the variable size, because
- // VariableSize == Size of Alloca described by DDI. Since SI stores
- // to the alloca described by DDI, if it's first operand is an extend,
- // we're guaranteed that before extension, the value was narrower than
- // the size of the alloca, hence the size of the described variable.
- SmallVector<uint64_t, 3> Ops;
- unsigned FragmentOffset = 0;
- // If this already is a bit fragment, we drop the bit fragment from the
- // expression and record the offset.
- auto Fragment = DIExpr->getFragmentInfo();
- if (Fragment) {
- Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3);
- FragmentOffset = Fragment->OffsetInBits;
- } else {
- Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ // If this DDI was already describing only a fragment of a variable, ensure
+ // that fragment is appropriately narrowed here.
+ // But if a fragment wasn't used, describe the value as the original
+ // argument (rather than the zext or sext) so that it remains described even
+ // if the sext/zext is optimized away. This widens the variable description,
+ // leaving it up to the consumer to know how the smaller value may be
+ // represented in a larger register.
+ if (auto Fragment = DIExpr->getFragmentInfo()) {
+ unsigned FragmentOffset = Fragment->OffsetInBits;
+ SmallVector<uint64_t, 3> Ops(DIExpr->elements_begin(),
+ DIExpr->elements_end() - 3);
+ Ops.push_back(dwarf::DW_OP_LLVM_fragment);
+ Ops.push_back(FragmentOffset);
+ const DataLayout &DL = DDI->getModule()->getDataLayout();
+ Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
+ DIExpr = Builder.createExpression(Ops);
}
- Ops.push_back(dwarf::DW_OP_LLVM_fragment);
- Ops.push_back(FragmentOffset);
- const DataLayout &DL = DDI->getModule()->getDataLayout();
- Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
- auto NewDIExpr = Builder.createExpression(Ops);
- if (!LdStHasDebugValue(DIVar, NewDIExpr, SI))
- Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr,
- DDI->getDebugLoc(), SI);
- } else if (!LdStHasDebugValue(DIVar, DIExpr, SI))
- Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr,
- DDI->getDebugLoc(), SI);
+ DV = ExtendedArg;
+ }
+ if (!LdStHasDebugValue(DIVar, DIExpr, SI))
+ Builder.insertDbgValueIntrinsic(DV, 0, DIVar, DIExpr, DDI->getDebugLoc(),
+ SI);
}
/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
@@ -1781,44 +1776,43 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J) {
combineMetadata(K, J, KnownIDs);
}
-unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
- DominatorTree &DT,
- const BasicBlockEdge &Root) {
+template <typename RootType, typename DominatesFn>
+static unsigned replaceDominatedUsesWith(Value *From, Value *To,
+ const RootType &Root,
+ const DominatesFn &Dominates) {
assert(From->getType() == To->getType());
-
+
unsigned Count = 0;
for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE; ) {
+ UI != UE;) {
Use &U = *UI++;
- if (DT.dominates(Root, U)) {
- U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '"
- << From->getName() << "' as "
- << *To << " in " << *U << "\n");
- ++Count;
- }
+ if (!Dominates(Root, U))
+ continue;
+ U.set(To);
+ DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
+ << *To << " in " << *U << "\n");
+ ++Count;
}
return Count;
}
unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
DominatorTree &DT,
- const BasicBlock *BB) {
- assert(From->getType() == To->getType());
+ const BasicBlockEdge &Root) {
+ auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) {
+ return DT.dominates(Root, U);
+ };
+ return ::replaceDominatedUsesWith(From, To, Root, Dominates);
+}
- unsigned Count = 0;
- for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE;) {
- Use &U = *UI++;
- auto *I = cast<Instruction>(U.getUser());
- if (DT.properlyDominates(BB, I->getParent())) {
- U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
- << *To << " in " << *U << "\n");
- ++Count;
- }
- }
- return Count;
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlock *BB) {
+ auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) {
+ auto *I = cast<Instruction>(U.getUser())->getParent();
+ return DT.properlyDominates(BB, I);
+ };
+ return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);
}
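The shape of the refactor above, reduced to a toy (names hypothetical): one
templated worker, with thin wrappers that differ only in the dominance lambda
they supply.

    #include <vector>

    template <typename RootT, typename DomFn>
    static unsigned replaceUsesIf(std::vector<int *> &Uses, int *To,
                                  const RootT &Root, const DomFn &Dominates) {
      unsigned Count = 0;
      for (int *&U : Uses) {
        if (!Dominates(Root, U))
          continue;
        U = To;
        ++Count;
      }
      return Count;
    }

    unsigned replaceAll(std::vector<int *> &Uses, int *To) {
      // Wrapper supplying a trivial predicate, just as the two overloads above
      // supply DT.dominates and DT.properlyDominates respectively.
      return replaceUsesIf(Uses, To, 0, [](int, int *) { return true; });
    }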
bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index 175d013a011d..81f033e7d51a 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -1112,3 +1113,203 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
else
return (FalseVal + (TrueVal / 2)) / TrueVal;
}
+
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)) {
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+// Helper to generate a log2 shuffle reduction.
+Value *
+llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+ ArrayRef<Value *> RedOps) {
+ unsigned VF = Src->getType()->getVectorNumElements();
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = Src;
+ SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask), "rdx.shuf");
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ // Floating point operations had to be 'fast' to enable the reduction.
+ TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
+ TmpVec, Shuf, "bin.rdx"));
+ } else {
+ assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+ "Invalid min/max");
+ TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
+ Shuf);
+ }
+ if (!RedOps.empty())
+ propagateIRFlags(TmpVec, RedOps);
+ }
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+}
+
+/// Create a simple vector reduction specified by an opcode and some
+/// flags (if generating min/max reductions).
+Value *llvm::createSimpleTargetReduction(
+ IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
+ Value *Src, TargetTransformInfo::ReductionFlags Flags,
+ ArrayRef<Value *> RedOps) {
+ assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
+
+ Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
+ std::function<Value*()> BuildFunc;
+ using RD = RecurrenceDescriptor;
+ RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
+ // TODO: Support creating ordered reductions.
+ FastMathFlags FMFUnsafe;
+ FMFUnsafe.setUnsafeAlgebra();
+
+ switch (Opcode) {
+ case Instruction::Add:
+ BuildFunc = [&]() { return Builder.CreateAddReduce(Src); };
+ break;
+ case Instruction::Mul:
+ BuildFunc = [&]() { return Builder.CreateMulReduce(Src); };
+ break;
+ case Instruction::And:
+ BuildFunc = [&]() { return Builder.CreateAndReduce(Src); };
+ break;
+ case Instruction::Or:
+ BuildFunc = [&]() { return Builder.CreateOrReduce(Src); };
+ break;
+ case Instruction::Xor:
+ BuildFunc = [&]() { return Builder.CreateXorReduce(Src); };
+ break;
+ case Instruction::FAdd:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::FMul:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::ICmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMaxReduce(Src, Flags.IsSigned);
+ };
+ } else {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMinReduce(Src, Flags.IsSigned);
+ };
+ }
+ break;
+ case Instruction::FCmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = RD::MRK_FloatMax;
+ BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); };
+ } else {
+ MinMaxKind = RD::MRK_FloatMin;
+ BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); };
+ }
+ break;
+ default:
+ llvm_unreachable("Unhandled opcode");
+ break;
+ }
+ if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
+ return BuildFunc();
+ return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+}
+
+/// Create a vector reduction using a given recurrence descriptor.
+Value *llvm::createTargetReduction(IRBuilder<> &Builder,
+ const TargetTransformInfo *TTI,
+ RecurrenceDescriptor &Desc, Value *Src,
+ bool NoNaN) {
+ // TODO: Support in-order reductions based on the recurrence descriptor.
+ RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind();
+ TargetTransformInfo::ReductionFlags Flags;
+ Flags.NoNaN = NoNaN;
+ auto getSimpleRdx = [&](unsigned Opc) {
+ return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags);
+ };
+ switch (RecKind) {
+ case RecurrenceDescriptor::RK_FloatAdd:
+ return getSimpleRdx(Instruction::FAdd);
+ case RecurrenceDescriptor::RK_FloatMult:
+ return getSimpleRdx(Instruction::FMul);
+ case RecurrenceDescriptor::RK_IntegerAdd:
+ return getSimpleRdx(Instruction::Add);
+ case RecurrenceDescriptor::RK_IntegerMult:
+ return getSimpleRdx(Instruction::Mul);
+ case RecurrenceDescriptor::RK_IntegerAnd:
+ return getSimpleRdx(Instruction::And);
+ case RecurrenceDescriptor::RK_IntegerOr:
+ return getSimpleRdx(Instruction::Or);
+ case RecurrenceDescriptor::RK_IntegerXor:
+ return getSimpleRdx(Instruction::Xor);
+ case RecurrenceDescriptor::RK_IntegerMinMax: {
+ switch (Desc.getMinMaxRecurrenceKind()) {
+ case RecurrenceDescriptor::MRK_SIntMax:
+ Flags.IsSigned = true;
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMax:
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_SIntMin:
+ Flags.IsSigned = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMin:
+ break;
+ default:
+ llvm_unreachable("Unhandled MRK");
+ }
+ return getSimpleRdx(Instruction::ICmp);
+ }
+ case RecurrenceDescriptor::RK_FloatMinMax: {
+ Flags.IsMaxOp =
+ Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax;
+ return getSimpleRdx(Instruction::FCmp);
+ }
+ default:
+ llvm_unreachable("Unhandled RecKind");
+ }
+}
+
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+ if (auto *VecOp = dyn_cast<Instruction>(I)) {
+ if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+ // VecOp is initialized to the 0th scalar, so start counting from index
+ // '1'.
+ VecOp->copyIRFlags(I0);
+ for (int i = 1, e = VL.size(); i < e; ++i) {
+ if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
+ VecOp->andIRFlags(Scalar);
+ }
+ }
+ }
+}
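
Note: getShuffleReduction above emits log2(VF) shuffle+op rounds, folding the upper half of the vector into the lower half until lane 0 holds the result. A minimal scalar sketch of the same schedule (plain C++ over a std::vector, not the IRBuilder API):

    #include <cassert>
    #include <iostream>
    #include <vector>

    // Halving reduction: each round folds the upper half into the lower half,
    // so an 8-element sum finishes in log2(8) = 3 rounds. This mirrors the
    // shuffle-based schedule; it is not the LLVM intrinsic itself.
    int halvingReduce(std::vector<int> V) {
      assert(!V.empty() && (V.size() & (V.size() - 1)) == 0 &&
             "only power-of-two widths, as the assert in the pass requires");
      for (size_t Width = V.size(); Width != 1; Width /= 2)
        for (size_t J = 0; J != Width / 2; ++J)
          V[J] += V[Width / 2 + J]; // "move the upper half onto the lower half"
      return V[0]; // the result lands in lane 0, like the extractelement
    }

    int main() {
      std::cout << halvingReduce({1, 2, 3, 4, 5, 6, 7, 8}) << "\n"; // 36
      return 0;
    }

Non-power-of-two widths are rejected up front, matching the assert in the pass.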
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index 29d334f2968f..2ef3d6336ae2 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -35,7 +35,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
// Upgrade a 2-field global array type to the new 3-field format if needed.
if (Data && OldEltTy->getNumElements() < 3)
EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
- IRB.getInt8PtrTy(), nullptr);
+ IRB.getInt8PtrTy());
else
EltTy = OldEltTy;
if (Constant *Init = GVCtor->getInitializer()) {
@@ -44,10 +44,10 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
for (unsigned i = 0; i != n; ++i) {
auto Ctor = cast<Constant>(Init->getOperand(i));
if (EltTy != OldEltTy)
- Ctor = ConstantStruct::get(
- EltTy, Ctor->getAggregateElement((unsigned)0),
- Ctor->getAggregateElement(1),
- Constant::getNullValue(IRB.getInt8PtrTy()), nullptr);
+ Ctor =
+ ConstantStruct::get(EltTy, Ctor->getAggregateElement((unsigned)0),
+ Ctor->getAggregateElement(1),
+ Constant::getNullValue(IRB.getInt8PtrTy()));
CurrentCtors.push_back(Ctor);
}
}
@@ -55,7 +55,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
} else {
// Use the new three-field struct if there isn't one already.
EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
- IRB.getInt8PtrTy(), nullptr);
+ IRB.getInt8PtrTy());
}
// Build a 2 or 3 field global_ctor entry. We don't take a comdat key.
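
Note: appendToGlobalArray upgrades legacy two-field global_ctors entries to the three-field layout by appending a null pointer. A standalone sketch of that rebuild step, with hypothetical Ctor2/Ctor3 stand-ins for the struct types (not the ConstantStruct API):

    #include <iostream>
    #include <vector>

    // Hypothetical stand-ins for the old and new global_ctors entry layouts.
    struct Ctor2 { int Priority; void (*Fn)(); };
    struct Ctor3 { int Priority; void (*Fn)(); void *Data; };

    // Rebuild the array in the wider layout, null-filling the new field, as
    // the pass does when the existing element type has fewer than three
    // members.
    std::vector<Ctor3> upgrade(const std::vector<Ctor2> &Old) {
      std::vector<Ctor3> New;
      New.reserve(Old.size());
      for (const Ctor2 &C : Old)
        New.push_back({C.Priority, C.Fn, nullptr});
      return New;
    }

    static void hello() { std::cout << "ctor\n"; }

    int main() {
      std::vector<Ctor3> New = upgrade({{65535, hello}});
      std::cout << New.size() << " entry, null data: "
                << (New[0].Data == nullptr) << "\n"; // 1 entry, null data: 1
      return 0;
    }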
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 9e71d746de34..1de579ed41b0 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1450,11 +1450,11 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
// x86_64 can't use {float, float} since that would be returned in both
// xmm0 and xmm1, which isn't what a real struct would do.
ResTy = T.getArch() == Triple::x86_64
- ? static_cast<Type *>(VectorType::get(ArgTy, 2))
- : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr));
+ ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+ : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
} else {
Name = "__sincospi_stret";
- ResTy = StructType::get(ArgTy, ArgTy, nullptr);
+ ResTy = StructType::get(ArgTy, ArgTy);
}
Module *M = OrigCallee->getParent();
diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp
index 83bd29dbca65..60d9ede2c487 100644
--- a/lib/Transforms/Utils/VNCoercion.cpp
+++ b/lib/Transforms/Utils/VNCoercion.cpp
@@ -303,6 +303,15 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
const DataLayout &DL) {
LLVMContext &Ctx = SrcVal->getType()->getContext();
+ // If two pointers are in the same address space, they have the same size,
+ // so we don't need to do any truncation, etc. This avoids introducing
+ // ptrtoint instructions for pointers that may be non-integral.
+ if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() &&
+ cast<PointerType>(SrcVal->getType())->getAddressSpace() ==
+ cast<PointerType>(LoadTy)->getAddressSpace()) {
+ return SrcVal;
+ }
+
uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
// Compute which bits of the stored value are being used by the load. Convert
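
Note: the early-out above works because two pointers in the same address space always have the same size, so the stored value can be returned unchanged; for all other types the helper rounds bit sizes up to whole bytes. A quick sketch of that rounding, using plain arithmetic rather than the DataLayout API:

    #include <cstdint>
    #include <iostream>

    // Round a size in bits up to whole bytes, as the helper does for both the
    // stored value and the loaded type: (Bits + 7) / 8.
    uint64_t bytesFor(uint64_t Bits) { return (Bits + 7) / 8; }

    int main() {
      std::cout << bytesFor(1) << " " << bytesFor(8) << " " << bytesFor(36)
                << "\n"; // 1 1 5
      return 0;
    }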
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 84d89f103a2f..930972924c3c 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -949,11 +949,10 @@ void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
Constant *NewV;
if (IsOldCtorDtor) {
auto *S = cast<ConstantStruct>(V);
- auto *E1 = mapValue(S->getOperand(0));
- auto *E2 = mapValue(S->getOperand(1));
- Value *Null = Constant::getNullValue(VoidPtrTy);
- NewV =
- ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr);
+ auto *E1 = cast<Constant>(mapValue(S->getOperand(0)));
+ auto *E2 = cast<Constant>(mapValue(S->getOperand(1)));
+ Constant *Null = Constant::getNullValue(VoidPtrTy);
+ NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);
} else {
NewV = cast_or_null<Constant>(mapValue(V));
}
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 97dcb40a1d72..9cf66382b581 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -346,7 +346,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
if (!Safe) {
KnownBits Known(BitWidth);
computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
- if (Known.Zero.countTrailingZeros() < (BitWidth - 1))
+ if (Known.countMaxTrailingOnes() < (BitWidth - 1))
Safe = true;
}
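
Note: the LoadStoreVectorizer change swaps Known.Zero.countTrailingZeros() for the equivalent countMaxTrailingOnes(): a low bit can still be one only if it is not known to be zero, so counting trailing zeros of the known-zero mask gives the maximum possible run of trailing ones. A self-contained sketch with a plain mask instead of the KnownBits class:

    #include <cstdint>
    #include <iostream>

    // Count how many trailing bits could possibly all be one: exactly the
    // number of trailing bits that are not known to be zero.
    unsigned countMaxTrailingOnes(uint32_t KnownZero) {
      unsigned N = 0;
      while (N < 32 && !(KnownZero & (1u << N)))
        ++N;
      return N;
    }

    int main() {
      // Low two bits known zero (e.g. a 4-byte-aligned value): no trailing
      // ones are possible.
      std::cout << countMaxTrailingOnes(0x3) << "\n"; // 0
      // Nothing known: all 32 trailing bits might be one.
      std::cout << countMaxTrailingOnes(0x0) << "\n"; // 32
      return 0;
    }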
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3fde0a453962..516ab7d03a88 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -391,13 +391,14 @@ public:
TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
AddedSafetyChecks(false) {}
- // Perform the actual loop widening (vectorization).
- void vectorize() {
- // Create a new empty loop. Unlink the old loop and connect the new one.
- createEmptyLoop();
- // Widen each instruction in the old loop to a new one in the new loop.
- vectorizeLoop();
- }
+ /// Create a new empty loop. Unlink the old loop and connect the new one.
+ void createVectorizedLoopSkeleton();
+
+ /// Vectorize a single instruction within the innermost loop.
+ void vectorizeInstruction(Instruction &I);
+
+ /// Fix the vectorized code, taking care of header phis, live-outs, and more.
+ void fixVectorizedLoop();
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
@@ -425,9 +426,6 @@ protected:
EdgeMaskCacheTy;
typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
- /// Create an empty loop, based on the loop ranges of the old loop.
- void createEmptyLoop();
-
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *CountRoundDown, Value *EndValue,
@@ -436,8 +434,6 @@ protected:
/// Create a new induction variable inside L.
PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
Value *Step, Instruction *DL);
- /// Copy and widen the instructions from the old loop.
- virtual void vectorizeLoop();
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs();
@@ -450,10 +446,10 @@ protected:
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);
- /// \brief The Loop exit block may have single value PHI nodes where the
- /// incoming value is 'Undef'. While vectorizing we only handled real values
- /// that were defined inside the loop. Here we fix the 'undef case'.
- /// See PR14725.
+ /// \brief The Loop exit block may have single-value PHI nodes with some
+ /// incoming value. While vectorizing we only handled real values that were
+ /// defined inside the loop, and we should have one value for each
+ /// predecessor of its parent basic block. See PR14725.
void fixLCSSAPHIs();
/// Iteratively sink the scalarized operands of a predicated instruction into
@@ -464,11 +460,6 @@ protected:
/// respective conditions.
void predicateInstructions();
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
@@ -481,10 +472,6 @@ protected:
/// and DST.
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
- /// A helper function to vectorize a single instruction within the innermost
- /// loop.
- void vectorizeInstruction(Instruction &I);
-
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
@@ -1700,6 +1687,9 @@ public:
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+ // Returns true if the NoNaN attribute is set on the function.
+ bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
+
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -2185,7 +2175,10 @@ public:
/// passed Legality checks.
class LoopVectorizationPlanner {
public:
- LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {}
+ LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM)
+ : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
~LoopVectorizationPlanner() {}
@@ -2193,7 +2186,25 @@ public:
LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
unsigned UserVF);
+ /// Generate the IR code for the vectorized loop.
+ void executePlan(InnerLoopVectorizer &ILV);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
private:
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
/// The profitability analysis.
LoopVectorizationCostModel &CM;
};
@@ -3361,7 +3372,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
LVer->prepareNoAliasMetadata();
}
-void InnerLoopVectorizer::createEmptyLoop() {
+void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
@@ -3883,36 +3894,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
}
}
-void InnerLoopVectorizer::vectorizeLoop() {
- //===------------------------------------------------===//
- //
- // Notice: any optimization or new instruction that go
- // into the code below should be also be implemented in
- // the cost-model.
- //
- //===------------------------------------------------===//
-
- // Collect instructions from the original loop that will become trivially dead
- // in the vectorized loop. We don't need to vectorize these instructions. For
- // example, original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- collectTriviallyDeadInstructions(DeadInstructions);
-
- // Scan the loop in a topological order to ensure that defs are vectorized
- // before users.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- // Vectorize all instructions in the original loop that will not become
- // trivially dead when vectorized.
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- for (Instruction &I : *BB)
- if (!DeadInstructions.count(&I))
- vectorizeInstruction(I);
-
+void InnerLoopVectorizer::fixVectorizedLoop() {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF > 1)
@@ -4049,8 +4031,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Set the insertion point after the previous value if it is an instruction.
// Note that the previous value may have been constant-folded so it is not
- // guaranteed to be an instruction in the vector loop.
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]))
+ // guaranteed to be an instruction in the vector loop. Also, if the previous
+ // value is a phi node, we should insert after all the phi nodes to avoid
+ // breaking basic block verification.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) ||
+ isa<PHINode>(PreviousParts[UF - 1]))
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
else
Builder.SetInsertPoint(
@@ -4258,39 +4243,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
if (VF > 1) {
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
- UndefValue::get(Builder.getInt32Ty()));
-
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()),
- ConstantVector::get(ShuffleMask), "rdx.shuf");
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- TmpVec = addFastMathFlag(Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
- else
- TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
- TmpVec, Shuf);
- }
-
- // The result is in the first element of the vector.
+ bool NoNaN = Legal->hasFunNoNaNAttr();
ReducedPartRdx =
- Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (Phi->getType() != RdxDesc.getRecurrenceType())
@@ -4345,33 +4300,11 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
if (!LCSSAPhi)
break;
- if (LCSSAPhi->getNumIncomingValues() == 1)
- LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
- LoopMiddleBlock);
- }
-}
-
-void InnerLoopVectorizer::collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
- BasicBlock *Latch = OrigLoop->getLoopLatch();
-
- // We create new control-flow for the vectorized loop, so the original
- // condition will be dead after vectorization if it's only used by the
- // branch.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && Cmp->hasOneUse())
- DeadInstructions.insert(Cmp);
-
- // We create new "steps" for induction variable updates to which the original
- // induction variables map. An original update instruction will be dead if
- // all its users except the induction variable are dead.
- for (auto &Induction : *Legal->getInductionVars()) {
- PHINode *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- if (all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.count(cast<Instruction>(U));
- }))
- DeadInstructions.insert(IndUpdate);
+ if (LCSSAPhi->getNumIncomingValues() == 1) {
+ assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
+ "Incoming value isn't loop invariant");
+ LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
+ }
}
}
@@ -7577,6 +7510,72 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
return CM.selectVectorizationFactor(MaxVF);
}
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
+ // Perform the actual loop transformation.
+
+ // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ ILV.createVectorizedLoopSkeleton();
+
+ //===------------------------------------------------===//
+ //
+ // Notice: any optimization or new instruction that goes
+ // into the code below should also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
+
+ // 2. Copy and widen instructions from the old loop into the new loop.
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ // Scan the loop in a topological order to ensure that defs are vectorized
+ // before users.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ // Vectorize all instructions in the original loop that will not become
+ // trivially dead when vectorized.
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
+ for (Instruction &I : *BB)
+ if (!DeadInstructions.count(&I))
+ ILV.vectorizeInstruction(I);
+
+ // 3. Fix the vectorized code: take care of header phis, live-outs,
+ // predication, updating analyses.
+ ILV.fixVectorizedLoop();
+}
+
+void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+ // We create new control-flow for the vectorized loop, so the original
+ // condition will be dead after vectorization if it's only used by the
+ // branch.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && Cmp->hasOneUse())
+ DeadInstructions.insert(Cmp);
+
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ }))
+ DeadInstructions.insert(IndUpdate);
+ }
+}
+
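
Note: collectTriviallyDeadInstructions, now hosted by the planner, marks an induction update dead only when every user is the induction phi itself or an instruction already known dead. A toy sketch of that all_of test over hypothetical node ids (not LLVM's User class):

    #include <algorithm>
    #include <iostream>
    #include <set>
    #include <vector>

    int main() {
      // Hypothetical ids: 0 is the induction phi, 1 its update instruction.
      int Ind = 0, IndUpdate = 1;
      std::vector<int> UpdateUsers = {0}; // the update only feeds the phi
      std::set<int> DeadInstructions;

      // Dead if every user is either the phi or already known dead, mirroring
      // the all_of predicate in the planner.
      bool Dead = std::all_of(UpdateUsers.begin(), UpdateUsers.end(),
                              [&](int U) {
                                return U == Ind || DeadInstructions.count(U);
                              });
      if (Dead)
        DeadInstructions.insert(IndUpdate);
      std::cout << "update dead: " << Dead << "\n"; // update dead: 1
      return 0;
    }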
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
auto *SI = dyn_cast<StoreInst>(Instr);
bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
@@ -7759,7 +7758,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(CM);
+ LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();
@@ -7853,7 +7852,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM);
- Unroller.vectorize();
+ LVP.executePlan(Unroller);
ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
L->getHeader())
@@ -7863,7 +7862,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM);
- LB.vectorize();
+ LVP.executePlan(LB);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there are
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f112c555205c..64013d6d687d 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -40,7 +40,9 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>
@@ -212,23 +214,6 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
return Opcode;
}
-/// Get the intersection (logical and) of all of the potential IR flags
-/// of each scalar operation (VL) that will be converted into a vector (I).
-/// Flag set: NSW, NUW, exact, and all of fast-math.
-static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
- if (auto *VecOp = dyn_cast<Instruction>(I)) {
- if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
- // VecOVp is initialized to the 0th scalar, so start counting from index
- // '1'.
- VecOp->copyIRFlags(I0);
- for (int i = 1, e = VL.size(); i < e; ++i) {
- if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
- VecOp->andIRFlags(Scalar);
- }
- }
- }
-}
-
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
@@ -315,10 +300,10 @@ public:
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
- const DataLayout *DL)
+ const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
- DL(DL), Builder(Se->getContext()) {
+ DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
@@ -331,7 +316,10 @@ public:
else
MaxVecRegSize = TTI->getRegisterBitWidth(true);
- MinVecRegSize = MinVectorRegSizeOption;
+ if (MinVectorRegSizeOption.getNumOccurrences())
+ MinVecRegSize = MinVectorRegSizeOption;
+ else
+ MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
/// \brief Vectorize the tree that starts with the elements in \p VL.
@@ -377,6 +365,8 @@ public:
MinBWs.clear();
}
+ unsigned getTreeSize() const { return VectorizableTree.size(); }
+
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
@@ -415,6 +405,8 @@ public:
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable();
+ OptimizationRemarkEmitter *getORE() { return ORE; }
+
private:
struct TreeEntry;
@@ -944,6 +936,8 @@ private:
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
+ OptimizationRemarkEmitter *ORE;
+
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
@@ -1835,11 +1829,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
- int ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
- Op2VK, Op1VP, Op2VP);
+ SmallVector<const Value *, 4> Operands(VL0->operand_values());
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+ Op2VP, Operands);
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
+ Op1VP, Op2VP, Operands);
return VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3703,10 +3699,8 @@ void BoUpSLP::computeMinimumValueSizes() {
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
- bool KnownZero = false;
- bool KnownOne = false;
- ComputeSignBit(R, KnownZero, KnownOne, *DL);
- return KnownZero;
+ KnownBits Known = computeKnownBits(R, *DL);
+ return Known.isNonNegative();
});
// Determine the maximum number of bits required to store the scalar
@@ -3786,8 +3780,9 @@ struct SLPVectorizer : public FunctionPass {
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -3799,6 +3794,7 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -3817,8 +3813,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+ auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
if (!Changed)
return PreservedAnalyses::all();
@@ -3833,7 +3830,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
LoopInfo *LI_, DominatorTree *DT_,
- AssumptionCache *AC_, DemandedBits *DB_) {
+ AssumptionCache *AC_, DemandedBits *DB_,
+ OptimizationRemarkEmitter *ORE_) {
SE = SE_;
TTI = TTI_;
TLI = TLI_;
@@ -3861,7 +3859,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
- BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
@@ -3950,6 +3948,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ using namespace ore;
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Chain[i]))
+ << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+ << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
+
R.vectorizeTree();
// Move to the next bundle.
@@ -4163,6 +4168,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
+
Value *VectorizedRoot = R.vectorizeTree();
// Reconstruct the build vector by extracting the vectorized root. This
@@ -4506,6 +4517,12 @@ public:
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");
+ auto *I0 = cast<Instruction>(VL[0]);
+ V.getORE()->emit(
+ OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize()));
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
@@ -4513,7 +4530,7 @@ public:
// Emit a reduction.
Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
@@ -4583,33 +4600,31 @@ private:
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
- unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
+ unsigned ReduxWidth, ArrayRef<Value *> RedOps,
+ const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ if (!IsPairwiseReduction)
+ return createSimpleTargetReduction(
+ Builder, TTI, ReductionOpcode, VectorizedValue,
+ TargetTransformInfo::ReductionFlags(), RedOps);
+
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- if (IsPairwiseReduction) {
- Value *LeftMask =
+ Value *LeftMask =
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
- Value *RightMask =
+ Value *RightMask =
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
- Value *LeftShuf = Builder.CreateShuffleVector(
+ Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
- Value *RightShuf = Builder.CreateShuffleVector(
+ Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf,
- "bin.rdx");
- } else {
- Value *UpperHalf =
- createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
- }
+ TmpVec =
+ Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
propagateIRFlags(TmpVec, RedOps);
}
@@ -5162,6 +5177,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
namespace llvm {
diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp
index d2984697c8a9..6677063f944f 100644
--- a/lib/XRay/Trace.cpp
+++ b/lib/XRay/Trace.cpp
@@ -115,6 +115,7 @@ struct FDRState {
uint16_t CPUId;
uint16_t ThreadId;
uint64_t BaseTSC;
+
/// Encode some of the state transitions for the FDR log reader as explicit
/// checks. These are expectations for the next Record in the stream.
enum class Token {
@@ -123,8 +124,10 @@ struct FDRState {
NEW_CPU_ID_RECORD,
FUNCTION_SEQUENCE,
SCAN_TO_END_OF_THREAD_BUF,
+ CUSTOM_EVENT_DATA,
};
Token Expects;
+
// Each thread's buffer may have trailing garbage to scan over, so we track our
// progress.
uint64_t CurrentBufferSize;
@@ -143,6 +146,8 @@ Twine fdrStateToTwine(const FDRState::Token &state) {
return "FUNCTION_SEQUENCE";
case FDRState::Token::SCAN_TO_END_OF_THREAD_BUF:
return "SCAN_TO_END_OF_THREAD_BUF";
+ case FDRState::Token::CUSTOM_EVENT_DATA:
+ return "CUSTOM_EVENT_DATA";
}
return "UNKNOWN";
}
@@ -212,13 +217,32 @@ Error processFDRWallTimeRecord(FDRState &State, uint8_t RecordFirstByte,
return Error::success();
}
+/// State transition when a CustomEventMarker is encountered.
+Error processCustomEventMarker(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor,
+ size_t &RecordSize) {
+ // We can encounter a CustomEventMarker anywhere in the log, so we can handle
+ // it regardless of the expectation. However, we do set the expectation to
+ // read a fixed number of bytes, as described in the metadata.
+ uint32_t OffsetPtr = 1; // Read after the first byte.
+ uint32_t DataSize = RecordExtractor.getU32(&OffsetPtr);
+ uint64_t TSC = RecordExtractor.getU64(&OffsetPtr);
+
+ // FIXME: Actually represent the record through the API. For now we only skip
+ // through the data.
+ (void)TSC;
+ RecordSize = 16 + DataSize;
+ return Error::success();
+}
+
/// Advances the state machine for reading the FDR record type by reading one
/// Metadata Record and updating the State appropriately based on the kind of
/// record encountered. The RecordKind is encoded in the first byte of the
/// Record, which the caller should pass in because they have already read it
/// to determine that this is a metadata record as opposed to a function record.
Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
- DataExtractor &RecordExtractor) {
+ DataExtractor &RecordExtractor,
+ size_t &RecordSize) {
// The remaining 7 bits are the RecordKind enum.
uint8_t RecordKind = RecordFirstByte >> 1;
switch (RecordKind) {
@@ -247,6 +271,11 @@ Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
processFDRWallTimeRecord(State, RecordFirstByte, RecordExtractor))
return E;
break;
+ case 5: // CustomEventMarker
+ if (auto E = processCustomEventMarker(State, RecordFirstByte,
+ RecordExtractor, RecordSize))
+ return E;
+ break;
default:
// Widen the record type to uint16_t to prevent conversion to char.
return make_error<StringError>(
@@ -400,7 +429,8 @@ Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
bool isMetadataRecord = BitField & 0x01uL;
if (isMetadataRecord) {
RecordSize = 16;
- if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor))
+ if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor,
+ RecordSize))
return E;
State.CurrentBufferConsumed += RecordSize;
} else { // Process Function Record
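
Note: processCustomEventMarker reads a u32 payload size and a u64 TSC after the kind byte, then has the driver skip 16 + DataSize bytes. A standalone sketch of that framing over a byte buffer (hypothetical helper, not the DataExtractor API; assumes a little-endian host like the trace format):

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    // Compute how many bytes a custom-event record occupies: a 16-byte
    // metadata record (kind byte, u32 size, u64 TSC, padding) plus payload.
    size_t customEventRecordSize(const std::vector<uint8_t> &Buf, size_t Pos) {
      uint32_t DataSize;
      uint64_t TSC;
      std::memcpy(&DataSize, &Buf[Pos + 1], sizeof(DataSize)); // after kind
      std::memcpy(&TSC, &Buf[Pos + 5], sizeof(TSC));
      (void)TSC; // the reader currently only skips the payload
      return 16 + DataSize;
    }

    int main() {
      std::vector<uint8_t> Buf(32, 0);
      uint32_t DataSize = 8;
      std::memcpy(&Buf[1], &DataSize, sizeof(DataSize));
      std::cout << customEventRecordSize(Buf, 0) << "\n"; // 24
      return 0;
    }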
diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt
index 79d8fc7df99b..9102efbdcb46 100644
--- a/projects/CMakeLists.txt
+++ b/projects/CMakeLists.txt
@@ -22,7 +22,9 @@ endforeach(entry)
if(${LLVM_BUILD_RUNTIME})
# MSVC isn't quite working with libc++ yet, disable it until issues are
# fixed.
- if(NOT MSVC)
+ # FIXME: LLVM_FORCE_BUILD_RUNTIME is currently used by libc++ to force
+ # enable the in-tree build when targeting clang-cl.
+ if(NOT MSVC OR LLVM_FORCE_BUILD_RUNTIME)
# Add the projects in reverse order of their dependencies so that the
# dependent projects can see the target names of their dependencies.
add_llvm_external_project(libunwind)
diff --git a/test/Analysis/BasicAA/cs-cs-arm.ll b/test/Analysis/BasicAA/cs-cs-arm.ll
new file mode 100644
index 000000000000..1580af9ea826
--- /dev/null
+++ b/test/Analysis/BasicAA/cs-cs-arm.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+; REQUIRES: arm
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "arm-apple-ios"
+
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
+
+define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
+entry:
+ %q = getelementptr i8, i8* %p, i64 16
+ %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
+ call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+ %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
+ %c = add <8 x i16> %a, %b
+ ret <8 x i16> %c
+
+; CHECK-LABEL: Function: test1:
+
+; CHECK: NoAlias: i8* %p, i8* %q
+; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
+; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+}
diff --git a/test/Analysis/BasicAA/cs-cs.ll b/test/Analysis/BasicAA/cs-cs.ll
index 0f74dbd92bbd..870794c25165 100644
--- a/test/Analysis/BasicAA/cs-cs.ll
+++ b/test/Analysis/BasicAA/cs-cs.ll
@@ -2,41 +2,12 @@
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
target triple = "arm-apple-ios"
-declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
-declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
-
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
declare void @a_readonly_func(i8 *) noinline nounwind readonly
declare void @a_writeonly_func(i8 *) noinline nounwind writeonly
-define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
-entry:
- %q = getelementptr i8, i8* %p, i64 16
- %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
- call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
- %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
- %c = add <8 x i16> %a, %b
- ret <8 x i16> %c
-
-; CHECK-LABEL: Function: test1:
-
-; CHECK: NoAlias: i8* %p, i8* %q
-; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
-; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
-; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
-; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
-; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
-; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
-; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
-; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
-; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
-; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)
-; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
-}
-
define void @test2(i8* %P, i8* %Q) nounwind ssp {
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
@@ -247,9 +218,9 @@ define void @test7(i8* %P) nounwind ssp {
; CHECK: Just Ref: call void @a_readonly_func(i8* %P) <-> call void @a_writeonly_func(i8* %P)
}
-declare void @an_inaccessiblememonly_func() nounwind inaccessiblememonly
-declare void @an_inaccessibleorargmemonly_func(i8 *) nounwind inaccessiblemem_or_argmemonly
-declare void @an_argmemonly_func(i8 *) nounwind argmemonly
+declare void @an_inaccessiblememonly_func() nounwind inaccessiblememonly
+declare void @an_inaccessibleorargmemonly_func(i8 *) nounwind inaccessiblemem_or_argmemonly
+declare void @an_argmemonly_func(i8 *) nounwind argmemonly
define void @test8(i8* %p) {
entry:
@@ -260,7 +231,7 @@ entry:
call void @an_inaccessiblememonly_func()
call void @an_inaccessibleorargmemonly_func(i8* %q)
call void @an_argmemonly_func(i8* %q)
- ret void
+ ret void
; CHECK-LABEL: Function: test8
; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessiblememonly_func()
diff --git a/test/Analysis/BasicAA/intrinsics-arm.ll b/test/Analysis/BasicAA/intrinsics-arm.ll
new file mode 100644
index 000000000000..e15ce1c65c64
--- /dev/null
+++ b/test/Analysis/BasicAA/intrinsics-arm.ll
@@ -0,0 +1,31 @@
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
+; REQUIRES: arm
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+
+; BasicAA should prove that these calls don't interfere, since we've
+; specifically special-cased exactly these two intrinsics in
+; MemoryLocation::getForArgument.
+
+; CHECK: define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %q = getelementptr i8, i8* %p, i64 16
+; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR:#[0-9]+]]
+; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK-NEXT: %c = add <8 x i16> %a, %a
+define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
+entry:
+ %q = getelementptr i8, i8* %p, i64 16
+ %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
+ call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+ %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
+ %c = add <8 x i16> %a, %b
+ ret <8 x i16> %c
+}
+
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
+
+; CHECK: attributes #0 = { argmemonly nounwind readonly }
+; CHECK: attributes #1 = { argmemonly nounwind }
+; CHECK: attributes [[ATTR]] = { nounwind }
diff --git a/test/Analysis/BasicAA/intrinsics.ll b/test/Analysis/BasicAA/intrinsics.ll
index 526a039ef7ac..68e59862bcc1 100644
--- a/test/Analysis/BasicAA/intrinsics.ll
+++ b/test/Analysis/BasicAA/intrinsics.ll
@@ -5,38 +5,22 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-
; BasicAA should prove that these calls don't interfere, since they are
; IntrArgReadMem and have noalias pointers.
-; CHECK: define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) {
+; CHECK: define <8 x i16> @test0(<8 x i16>* noalias %p, <8 x i16>* noalias %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR:#[0-9]+]]
-; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK-NEXT: %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) [[ATTR:#[0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m)
; CHECK-NEXT: %c = add <8 x i16> %a, %a
-define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) {
+define <8 x i16> @test0(<8 x i16>* noalias %p, <8 x i16>* noalias %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) {
entry:
- %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
- call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
- %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
+ %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m)
+ %b = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind
%c = add <8 x i16> %a, %b
ret <8 x i16> %c
}
-; CHECK: define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: %q = getelementptr i8, i8* %p, i64 16
-; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR]]
-; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
-; CHECK-NEXT: %c = add <8 x i16> %a, %a
-define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
-entry:
- %q = getelementptr i8, i8* %p, i64 16
- %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
- call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
- %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
- %c = add <8 x i16> %a, %b
- ret <8 x i16> %c
-}
-
-declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
-declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) nounwind readonly
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) nounwind
; CHECK: attributes #0 = { argmemonly nounwind readonly }
; CHECK: attributes #1 = { argmemonly nounwind }
diff --git a/test/Analysis/BranchProbabilityInfo/basic.ll b/test/Analysis/BranchProbabilityInfo/basic.ll
index 84936b7761ca..7bee1bd57373 100644
--- a/test/Analysis/BranchProbabilityInfo/basic.ll
+++ b/test/Analysis/BranchProbabilityInfo/basic.ll
@@ -452,7 +452,7 @@ entry:
i32 3, label %case_d
i32 4, label %case_e ], !prof !8
; CHECK: edge entry -> case_a probability is 0x00000800 / 0x80000000 = 0.00%
-; CHECK: edge entry -> case_b probability is 0x07fffe01 / 0x80000000 = 6.25%
+; CHECK: edge entry -> case_b probability is 0x07fffdff / 0x80000000 = 6.25%
; CHECK: edge entry -> case_c probability is 0x67fffdff / 0x80000000 = 81.25% [HOT edge]
; CHECK: edge entry -> case_d probability is 0x07fffdff / 0x80000000 = 6.25%
; CHECK: edge entry -> case_e probability is 0x07fffdff / 0x80000000 = 6.25%
@@ -495,7 +495,7 @@ entry:
i32 4, label %case_e ], !prof !9
; CHECK: edge entry -> case_a probability is 0x00000400 / 0x80000000 = 0.00%
; CHECK: edge entry -> case_b probability is 0x00000400 / 0x80000000 = 0.00%
-; CHECK: edge entry -> case_c probability is 0x6aaaa800 / 0x80000000 = 83.33% [HOT edge]
+; CHECK: edge entry -> case_c probability is 0x6aaaa7ff / 0x80000000 = 83.33% [HOT edge]
; CHECK: edge entry -> case_d probability is 0x0aaaa7ff / 0x80000000 = 8.33%
; CHECK: edge entry -> case_e probability is 0x0aaaa7ff / 0x80000000 = 8.33%
@@ -535,7 +535,7 @@ entry:
i32 4, label %case_e ], !prof !10
; CHECK: edge entry -> case_a probability is 0x00000000 / 0x80000000 = 0.00%
; CHECK: edge entry -> case_b probability is 0x00000400 / 0x80000000 = 0.00%
-; CHECK: edge entry -> case_c probability is 0x6e08fa2e / 0x80000000 = 85.96% [HOT edge]
+; CHECK: edge entry -> case_c probability is 0x6e08fa2d / 0x80000000 = 85.96% [HOT edge]
; CHECK: edge entry -> case_d probability is 0x08fb80e9 / 0x80000000 = 7.02%
; CHECK: edge entry -> case_e probability is 0x08fb80e9 / 0x80000000 = 7.02%
diff --git a/test/Analysis/CostModel/AArch64/free-widening-casts.ll b/test/Analysis/CostModel/AArch64/free-widening-casts.ll
new file mode 100644
index 000000000000..07f32d1d8ba2
--- /dev/null
+++ b/test/Analysis/CostModel/AArch64/free-widening-casts.ll
@@ -0,0 +1,622 @@
+; RUN: opt < %s -mtriple=aarch64--linux-gnu -cost-model -analyze | FileCheck %s --check-prefix=COST
+; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE
+
+; COST-LABEL: uaddl_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16>
+; CODE-LABEL: uaddl_8h
+; CODE: uaddl v0.8h, v0.8b, v1.8b
+define <8 x i16> @uaddl_8h(<8 x i8> %a, <8 x i8> %b) {
+ %tmp0 = zext <8 x i8> %a to <8 x i16>
+ %tmp1 = zext <8 x i8> %b to <8 x i16>
+ %tmp2 = add <8 x i16> %tmp0, %tmp1
+ ret <8 x i16> %tmp2
+}
+
+; COST-LABEL: uaddl_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32>
+; CODE-LABEL: uaddl_4s
+; CODE: uaddl v0.4s, v0.4h, v1.4h
+define <4 x i32> @uaddl_4s(<4 x i16> %a, <4 x i16> %b) {
+ %tmp0 = zext <4 x i16> %a to <4 x i32>
+ %tmp1 = zext <4 x i16> %b to <4 x i32>
+ %tmp2 = add <4 x i32> %tmp0, %tmp1
+ ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: uaddl_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64>
+; CODE-LABEL: uaddl_2d
+; CODE: uaddl v0.2d, v0.2s, v1.2s
+define <2 x i64> @uaddl_2d(<2 x i32> %a, <2 x i32> %b) {
+ %tmp0 = zext <2 x i32> %a to <2 x i64>
+ %tmp1 = zext <2 x i32> %b to <2 x i64>
+ %tmp2 = add <2 x i64> %tmp0, %tmp1
+ ret <2 x i64> %tmp2
+}
+
+; COST-LABEL: uaddl2_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16>
+; CODE-LABEL: uaddl2_8h
+; CODE: uaddl2 v2.8h, v0.16b, v1.16b
+; CODE-NEXT: uaddl v0.8h, v0.8b, v1.8b
+define <16 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) {
+ %tmp0 = zext <16 x i8> %a to <16 x i16>
+ %tmp1 = zext <16 x i8> %b to <16 x i16>
+ %tmp2 = add <16 x i16> %tmp0, %tmp1
+ ret <16 x i16> %tmp2
+}
+
+; COST-LABEL: uaddl2_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32>
+; CODE-LABEL: uaddl2_4s
+; CODE: uaddl2 v2.4s, v0.8h, v1.8h
+; CODE-NEXT: uaddl v0.4s, v0.4h, v1.4h
+define <8 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) {
+ %tmp0 = zext <8 x i16> %a to <8 x i32>
+ %tmp1 = zext <8 x i16> %b to <8 x i32>
+ %tmp2 = add <8 x i32> %tmp0, %tmp1
+ ret <8 x i32> %tmp2
+}
+
+; COST-LABEL: uaddl2_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64>
+; CODE-LABEL: uaddl2_2d
+; CODE: uaddl2 v2.2d, v0.4s, v1.4s
+; CODE-NEXT: uaddl v0.2d, v0.2s, v1.2s
+define <4 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) {
+ %tmp0 = zext <4 x i32> %a to <4 x i64>
+ %tmp1 = zext <4 x i32> %b to <4 x i64>
+ %tmp2 = add <4 x i64> %tmp0, %tmp1
+ ret <4 x i64> %tmp2
+}
+
+; COST-LABEL: saddl_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16>
+; CODE-LABEL: saddl_8h
+; CODE: saddl v0.8h, v0.8b, v1.8b
+define <8 x i16> @saddl_8h(<8 x i8> %a, <8 x i8> %b) {
+ %tmp0 = sext <8 x i8> %a to <8 x i16>
+ %tmp1 = sext <8 x i8> %b to <8 x i16>
+ %tmp2 = add <8 x i16> %tmp0, %tmp1
+ ret <8 x i16> %tmp2
+}
+
+; COST-LABEL: saddl_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32>
+; CODE-LABEL: saddl_4s
+; CODE: saddl v0.4s, v0.4h, v1.4h
+define <4 x i32> @saddl_4s(<4 x i16> %a, <4 x i16> %b) {
+ %tmp0 = sext <4 x i16> %a to <4 x i32>
+ %tmp1 = sext <4 x i16> %b to <4 x i32>
+ %tmp2 = add <4 x i32> %tmp0, %tmp1
+ ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: saddl_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64>
+; CODE-LABEL: saddl_2d
+; CODE: saddl v0.2d, v0.2s, v1.2s
+define <2 x i64> @saddl_2d(<2 x i32> %a, <2 x i32> %b) {
+ %tmp0 = sext <2 x i32> %a to <2 x i64>
+ %tmp1 = sext <2 x i32> %b to <2 x i64>
+ %tmp2 = add <2 x i64> %tmp0, %tmp1
+ ret <2 x i64> %tmp2
+}
+
+; COST-LABEL: saddl2_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16>
+; CODE-LABEL: saddl2_8h
+; CODE: saddl2 v2.8h, v0.16b, v1.16b
+; CODE-NEXT: saddl v0.8h, v0.8b, v1.8b
+define <16 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) {
+ %tmp0 = sext <16 x i8> %a to <16 x i16>
+ %tmp1 = sext <16 x i8> %b to <16 x i16>
+ %tmp2 = add <16 x i16> %tmp0, %tmp1
+ ret <16 x i16> %tmp2
+}
+
+; COST-LABEL: saddl2_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32>
+; CODE-LABEL: saddl2_4s
+; CODE: saddl2 v2.4s, v0.8h, v1.8h
+; CODE-NEXT: saddl v0.4s, v0.4h, v1.4h
+define <8 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) {
+ %tmp0 = sext <8 x i16> %a to <8 x i32>
+ %tmp1 = sext <8 x i16> %b to <8 x i32>
+ %tmp2 = add <8 x i32> %tmp0, %tmp1
+ ret <8 x i32> %tmp2
+}
+
+; COST-LABEL: saddl2_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64>
+; CODE-LABEL: saddl2_2d
+; CODE: saddl2 v2.2d, v0.4s, v1.4s
+; CODE-NEXT: saddl v0.2d, v0.2s, v1.2s
+define <4 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) {
+ %tmp0 = sext <4 x i32> %a to <4 x i64>
+ %tmp1 = sext <4 x i32> %b to <4 x i64>
+ %tmp2 = add <4 x i64> %tmp0, %tmp1
+ ret <4 x i64> %tmp2
+}
+
+; COST-LABEL: usubl_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16>
+; CODE-LABEL: usubl_8h
+; CODE: usubl v0.8h, v0.8b, v1.8b
+define <8 x i16> @usubl_8h(<8 x i8> %a, <8 x i8> %b) {
+ %tmp0 = zext <8 x i8> %a to <8 x i16>
+ %tmp1 = zext <8 x i8> %b to <8 x i16>
+ %tmp2 = sub <8 x i16> %tmp0, %tmp1
+ ret <8 x i16> %tmp2
+}
+
+; COST-LABEL: usubl_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32>
+; CODE-LABEL: usubl_4s
+; CODE: usubl v0.4s, v0.4h, v1.4h
+define <4 x i32> @usubl_4s(<4 x i16> %a, <4 x i16> %b) {
+ %tmp0 = zext <4 x i16> %a to <4 x i32>
+ %tmp1 = zext <4 x i16> %b to <4 x i32>
+ %tmp2 = sub <4 x i32> %tmp0, %tmp1
+ ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: usubl_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64>
+; CODE-LABEL: usubl_2d
+; CODE: usubl v0.2d, v0.2s, v1.2s
+define <2 x i64> @usubl_2d(<2 x i32> %a, <2 x i32> %b) {
+ %tmp0 = zext <2 x i32> %a to <2 x i64>
+ %tmp1 = zext <2 x i32> %b to <2 x i64>
+ %tmp2 = sub <2 x i64> %tmp0, %tmp1
+ ret <2 x i64> %tmp2
+}
+
+; COST-LABEL: usubl2_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16>
+; CODE-LABEL: usubl2_8h
+; CODE: usubl2 v2.8h, v0.16b, v1.16b
+; CODE-NEXT: usubl v0.8h, v0.8b, v1.8b
+define <16 x i16> @usubl2_8h(<16 x i8> %a, <16 x i8> %b) {
+ %tmp0 = zext <16 x i8> %a to <16 x i16>
+ %tmp1 = zext <16 x i8> %b to <16 x i16>
+ %tmp2 = sub <16 x i16> %tmp0, %tmp1
+ ret <16 x i16> %tmp2
+}
+
+; COST-LABEL: usubl2_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32>
+; CODE-LABEL: usubl2_4s
+; CODE: usubl2 v2.4s, v0.8h, v1.8h
+; CODE-NEXT: usubl v0.4s, v0.4h, v1.4h
+define <8 x i32> @usubl2_4s(<8 x i16> %a, <8 x i16> %b) {
+ %tmp0 = zext <8 x i16> %a to <8 x i32>
+ %tmp1 = zext <8 x i16> %b to <8 x i32>
+ %tmp2 = sub <8 x i32> %tmp0, %tmp1
+ ret <8 x i32> %tmp2
+}
+
+; COST-LABEL: usubl2_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64>
+; CODE-LABEL: usubl2_2d
+; CODE: usubl2 v2.2d, v0.4s, v1.4s
+; CODE-NEXT: usubl v0.2d, v0.2s, v1.2s
+define <4 x i64> @usubl2_2d(<4 x i32> %a, <4 x i32> %b) {
+ %tmp0 = zext <4 x i32> %a to <4 x i64>
+ %tmp1 = zext <4 x i32> %b to <4 x i64>
+ %tmp2 = sub <4 x i64> %tmp0, %tmp1
+ ret <4 x i64> %tmp2
+}
+
+; COST-LABEL: ssubl_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16>
+; CODE-LABEL: ssubl_8h
+; CODE: ssubl v0.8h, v0.8b, v1.8b
+define <8 x i16> @ssubl_8h(<8 x i8> %a, <8 x i8> %b) {
+ %tmp0 = sext <8 x i8> %a to <8 x i16>
+ %tmp1 = sext <8 x i8> %b to <8 x i16>
+ %tmp2 = sub <8 x i16> %tmp0, %tmp1
+ ret <8 x i16> %tmp2
+}
+
+; COST-LABEL: ssubl_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32>
+; CODE-LABEL: ssubl_4s
+; CODE: ssubl v0.4s, v0.4h, v1.4h
+define <4 x i32> @ssubl_4s(<4 x i16> %a, <4 x i16> %b) {
+ %tmp0 = sext <4 x i16> %a to <4 x i32>
+ %tmp1 = sext <4 x i16> %b to <4 x i32>
+ %tmp2 = sub <4 x i32> %tmp0, %tmp1
+ ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: ssubl_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64>
+; CODE-LABEL: ssubl_2d
+; CODE: ssubl v0.2d, v0.2s, v1.2s
+define <2 x i64> @ssubl_2d(<2 x i32> %a, <2 x i32> %b) {
+ %tmp0 = sext <2 x i32> %a to <2 x i64>
+ %tmp1 = sext <2 x i32> %b to <2 x i64>
+ %tmp2 = sub <2 x i64> %tmp0, %tmp1
+ ret <2 x i64> %tmp2
+}
+
+; COST-LABEL: ssubl2_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16>
+; CODE-LABEL: ssubl2_8h
+; CODE: ssubl2 v2.8h, v0.16b, v1.16b
+; CODE-NEXT: ssubl v0.8h, v0.8b, v1.8b
+define <16 x i16> @ssubl2_8h(<16 x i8> %a, <16 x i8> %b) {
+ %tmp0 = sext <16 x i8> %a to <16 x i16>
+ %tmp1 = sext <16 x i8> %b to <16 x i16>
+ %tmp2 = sub <16 x i16> %tmp0, %tmp1
+ ret <16 x i16> %tmp2
+}
+
+; COST-LABEL: ssubl2_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32>
+; CODE-LABEL: ssubl2_4s
+; CODE: ssubl2 v2.4s, v0.8h, v1.8h
+; CODE-NEXT: ssubl v0.4s, v0.4h, v1.4h
+define <8 x i32> @ssubl2_4s(<8 x i16> %a, <8 x i16> %b) {
+ %tmp0 = sext <8 x i16> %a to <8 x i32>
+ %tmp1 = sext <8 x i16> %b to <8 x i32>
+ %tmp2 = sub <8 x i32> %tmp0, %tmp1
+ ret <8 x i32> %tmp2
+}
+
+; COST-LABEL: ssubl2_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64>
+; CODE-LABEL: ssubl2_2d
+; CODE: ssubl2 v2.2d, v0.4s, v1.4s
+; CODE-NEXT: ssubl v0.2d, v0.2s, v1.2s
+define <4 x i64> @ssubl2_2d(<4 x i32> %a, <4 x i32> %b) {
+ %tmp0 = sext <4 x i32> %a to <4 x i64>
+ %tmp1 = sext <4 x i32> %b to <4 x i64>
+ %tmp2 = sub <4 x i64> %tmp0, %tmp1
+ ret <4 x i64> %tmp2
+}
+
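+; The cases below extend only one operand; these are expected to map to the
+; mixed-width {u,s}addw(2) and {u,s}subw(2) forms, with the extension again
+; reported as free.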
+; COST-LABEL: uaddw_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; CODE-LABEL: uaddw_8h
+; CODE: uaddw v0.8h, v1.8h, v0.8b
+define <8 x i16> @uaddw_8h(<8 x i8> %a, <8 x i16> %b) {
+ %tmp0 = zext <8 x i8> %a to <8 x i16>
+ %tmp1 = add <8 x i16> %b, %tmp0
+ ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: uaddw_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; CODE-LABEL: uaddw_4s
+; CODE: uaddw v0.4s, v1.4s, v0.4h
+define <4 x i32> @uaddw_4s(<4 x i16> %a, <4 x i32> %b) {
+ %tmp0 = zext <4 x i16> %a to <4 x i32>
+ %tmp1 = add <4 x i32> %b, %tmp0
+ ret <4 x i32> %tmp1
+}
+
+; COST-LABEL: uaddw_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; CODE-LABEL: uaddw_2d
+; CODE: uaddw v0.2d, v1.2d, v0.2s
+define <2 x i64> @uaddw_2d(<2 x i32> %a, <2 x i64> %b) {
+ %tmp0 = zext <2 x i32> %a to <2 x i64>
+ %tmp1 = add <2 x i64> %b, %tmp0
+ ret <2 x i64> %tmp1
+}
+
+; COST-LABEL: uaddw2_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; CODE-LABEL: uaddw2_8h
+; CODE: uaddw2 v2.8h, v2.8h, v0.16b
+; CODE-NEXT: uaddw v0.8h, v1.8h, v0.8b
+define <16 x i16> @uaddw2_8h(<16 x i8> %a, <16 x i16> %b) {
+ %tmp0 = zext <16 x i8> %a to <16 x i16>
+ %tmp1 = add <16 x i16> %b, %tmp0
+ ret <16 x i16> %tmp1
+}
+
+; COST-LABEL: uaddw2_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; CODE-LABEL: uaddw2_4s
+; CODE: uaddw2 v2.4s, v2.4s, v0.8h
+; CODE-NEXT: uaddw v0.4s, v1.4s, v0.4h
+define <8 x i32> @uaddw2_4s(<8 x i16> %a, <8 x i32> %b) {
+ %tmp0 = zext <8 x i16> %a to <8 x i32>
+ %tmp1 = add <8 x i32> %b, %tmp0
+ ret <8 x i32> %tmp1
+}
+
+; COST-LABEL: uaddw2_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; CODE-LABEL: uaddw2_2d
+; CODE: uaddw2 v2.2d, v2.2d, v0.4s
+; CODE-NEXT: uaddw v0.2d, v1.2d, v0.2s
+define <4 x i64> @uaddw2_2d(<4 x i32> %a, <4 x i64> %b) {
+ %tmp0 = zext <4 x i32> %a to <4 x i64>
+ %tmp1 = add <4 x i64> %b, %tmp0
+ ret <4 x i64> %tmp1
+}
+
+; COST-LABEL: saddw_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; CODE-LABEL: saddw_8h
+; CODE: saddw v0.8h, v1.8h, v0.8b
+define <8 x i16> @saddw_8h(<8 x i8> %a, <8 x i16> %b) {
+ %tmp0 = sext <8 x i8> %a to <8 x i16>
+ %tmp1 = add <8 x i16> %b, %tmp0
+ ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: saddw_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; CODE-LABEL: saddw_4s
+; CODE: saddw v0.4s, v1.4s, v0.4h
+define <4 x i32> @saddw_4s(<4 x i16> %a, <4 x i32> %b) {
+ %tmp0 = sext <4 x i16> %a to <4 x i32>
+ %tmp1 = add <4 x i32> %b, %tmp0
+ ret <4 x i32> %tmp1
+}
+
+; COST-LABEL: saddw_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; CODE-LABEL: saddw_2d
+; CODE: saddw v0.2d, v1.2d, v0.2s
+define <2 x i64> @saddw_2d(<2 x i32> %a, <2 x i64> %b) {
+ %tmp0 = sext <2 x i32> %a to <2 x i64>
+ %tmp1 = add <2 x i64> %b, %tmp0
+ ret <2 x i64> %tmp1
+}
+
+; COST-LABEL: saddw2_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; CODE-LABEL: saddw2_8h
+; CODE: saddw2 v2.8h, v2.8h, v0.16b
+; CODE-NEXT: saddw v0.8h, v1.8h, v0.8b
+define <16 x i16> @saddw2_8h(<16 x i8> %a, <16 x i16> %b) {
+ %tmp0 = sext <16 x i8> %a to <16 x i16>
+ %tmp1 = add <16 x i16> %b, %tmp0
+ ret <16 x i16> %tmp1
+}
+
+; COST-LABEL: saddw2_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; CODE-LABEL: saddw2_4s
+; CODE: saddw2 v2.4s, v2.4s, v0.8h
+; CODE-NEXT: saddw v0.4s, v1.4s, v0.4h
+define <8 x i32> @saddw2_4s(<8 x i16> %a, <8 x i32> %b) {
+ %tmp0 = sext <8 x i16> %a to <8 x i32>
+ %tmp1 = add <8 x i32> %b, %tmp0
+ ret <8 x i32> %tmp1
+}
+
+; COST-LABEL: saddw2_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; CODE-LABEL: saddw2_2d
+; CODE: saddw2 v2.2d, v2.2d, v0.4s
+; CODE-NEXT: saddw v0.2d, v1.2d, v0.2s
+define <4 x i64> @saddw2_2d(<4 x i32> %a, <4 x i64> %b) {
+ %tmp0 = sext <4 x i32> %a to <4 x i64>
+ %tmp1 = add <4 x i64> %b, %tmp0
+ ret <4 x i64> %tmp1
+}
+
+; COST-LABEL: usubw_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; CODE-LABEL: usubw_8h
+; CODE: usubw v0.8h, v1.8h, v0.8b
+define <8 x i16> @usubw_8h(<8 x i8> %a, <8 x i16> %b) {
+ %tmp0 = zext <8 x i8> %a to <8 x i16>
+ %tmp1 = sub <8 x i16> %b, %tmp0
+ ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: usubw_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; CODE-LABEL: usubw_4s
+; CODE: usubw v0.4s, v1.4s, v0.4h
+define <4 x i32> @usubw_4s(<4 x i16> %a, <4 x i32> %b) {
+ %tmp0 = zext <4 x i16> %a to <4 x i32>
+ %tmp1 = sub <4 x i32> %b, %tmp0
+ ret <4 x i32> %tmp1
+}
+
+; COST-LABEL: usubw_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; CODE-LABEL: usubw_2d
+; CODE: usubw v0.2d, v1.2d, v0.2s
+define <2 x i64> @usubw_2d(<2 x i32> %a, <2 x i64> %b) {
+ %tmp0 = zext <2 x i32> %a to <2 x i64>
+ %tmp1 = sub <2 x i64> %b, %tmp0
+ ret <2 x i64> %tmp1
+}
+
+; COST-LABEL: usubw2_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; CODE-LABEL: usubw2_8h
+; CODE: usubw2 v2.8h, v2.8h, v0.16b
+; CODE-NEXT: usubw v0.8h, v1.8h, v0.8b
+define <16 x i16> @usubw2_8h(<16 x i8> %a, <16 x i16> %b) {
+ %tmp0 = zext <16 x i8> %a to <16 x i16>
+ %tmp1 = sub <16 x i16> %b, %tmp0
+ ret <16 x i16> %tmp1
+}
+
+; COST-LABEL: usubw2_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; CODE-LABEL: usubw2_4s
+; CODE: usubw2 v2.4s, v2.4s, v0.8h
+; CODE-NEXT: usubw v0.4s, v1.4s, v0.4h
+define <8 x i32> @usubw2_4s(<8 x i16> %a, <8 x i32> %b) {
+ %tmp0 = zext <8 x i16> %a to <8 x i32>
+ %tmp1 = sub <8 x i32> %b, %tmp0
+ ret <8 x i32> %tmp1
+}
+
+; COST-LABEL: usubw2_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; CODE-LABEL: usubw2_2d
+; CODE: usubw2 v2.2d, v2.2d, v0.4s
+; CODE-NEXT: usubw v0.2d, v1.2d, v0.2s
+define <4 x i64> @usubw2_2d(<4 x i32> %a, <4 x i64> %b) {
+ %tmp0 = zext <4 x i32> %a to <4 x i64>
+ %tmp1 = sub <4 x i64> %b, %tmp0
+ ret <4 x i64> %tmp1
+}
+
+; COST-LABEL: ssubw_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; CODE-LABEL: ssubw_8h
+; CODE: ssubw v0.8h, v1.8h, v0.8b
+define <8 x i16> @ssubw_8h(<8 x i8> %a, <8 x i16> %b) {
+ %tmp0 = sext <8 x i8> %a to <8 x i16>
+ %tmp1 = sub <8 x i16> %b, %tmp0
+ ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: ssubw_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; CODE-LABEL: ssubw_4s
+; CODE: ssubw v0.4s, v1.4s, v0.4h
+define <4 x i32> @ssubw_4s(<4 x i16> %a, <4 x i32> %b) {
+ %tmp0 = sext <4 x i16> %a to <4 x i32>
+ %tmp1 = sub <4 x i32> %b, %tmp0
+ ret <4 x i32> %tmp1
+}
+
+; COST-LABEL: ssubw_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; CODE-LABEL: ssubw_2d
+; CODE: ssubw v0.2d, v1.2d, v0.2s
+define <2 x i64> @ssubw_2d(<2 x i32> %a, <2 x i64> %b) {
+ %tmp0 = sext <2 x i32> %a to <2 x i64>
+ %tmp1 = sub <2 x i64> %b, %tmp0
+ ret <2 x i64> %tmp1
+}
+
+; COST-LABEL: ssubw2_8h
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; CODE-LABEL: ssubw2_8h
+; CODE: ssubw2 v2.8h, v2.8h, v0.16b
+; CODE-NEXT: ssubw v0.8h, v1.8h, v0.8b
+define <16 x i16> @ssubw2_8h(<16 x i8> %a, <16 x i16> %b) {
+ %tmp0 = sext <16 x i8> %a to <16 x i16>
+ %tmp1 = sub <16 x i16> %b, %tmp0
+ ret <16 x i16> %tmp1
+}
+
+; COST-LABEL: ssubw2_4s
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; CODE-LABEL: ssubw2_4s
+; CODE: ssubw2 v2.4s, v2.4s, v0.8h
+; CODE-NEXT: ssubw v0.4s, v1.4s, v0.4h
+define <8 x i32> @ssubw2_4s(<8 x i16> %a, <8 x i32> %b) {
+ %tmp0 = sext <8 x i16> %a to <8 x i32>
+ %tmp1 = sub <8 x i32> %b, %tmp0
+ ret <8 x i32> %tmp1
+}
+
+; COST-LABEL: ssubw2_2d
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; CODE-LABEL: ssubw2_2d
+; CODE: ssubw2 v2.2d, v2.2d, v0.4s
+; CODE-NEXT: ssubw v0.2d, v1.2d, v0.2s
+define <4 x i64> @ssubw2_2d(<4 x i32> %a, <4 x i64> %b) {
+ %tmp0 = sext <4 x i32> %a to <4 x i64>
+ %tmp1 = sub <4 x i64> %b, %tmp0
+ ret <4 x i64> %tmp1
+}
+
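+; Negative tests: when the pattern cannot fold into a widening instruction
+; (wrong operand order, a non-add/sub user, mismatched extension kinds, or
+; types that do not form a legal widening pair), the extension is expected
+; to keep its usual non-zero cost.
+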
+; COST-LABEL: neg_wrong_operand_order
+; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+define <8 x i16> @neg_wrong_operand_order(<8 x i8> %a, <8 x i16> %b) {
+ %tmp0 = zext <8 x i8> %a to <8 x i16>
+ %tmp1 = sub <8 x i16> %tmp0, %b
+ ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: neg_non_widening_op
+; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+define <8 x i16> @neg_non_widening_op(<8 x i8> %a, <8 x i16> %b) {
+ %tmp0 = zext <8 x i8> %a to <8 x i16>
+ %tmp1 = udiv <8 x i16> %b, %tmp0
+ ret <8 x i16> %tmp1
+}
+
+; COST-LABEL: neg_dissimilar_operand_kind_0
+; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16>
+define <8 x i16> @neg_dissimilar_operand_kind_0(<8 x i8> %a, <8 x i8> %b) {
+ %tmp0 = sext <8 x i8> %a to <8 x i16>
+ %tmp1 = zext <8 x i8> %b to <8 x i16>
+ %tmp2 = add <8 x i16> %tmp0, %tmp1
+ ret <8 x i16> %tmp2
+}
+
+; COST-LABEL: neg_dissimilar_operand_kind_1
+; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <4 x i8> %a to <4 x i32>
+; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32>
+define <4 x i32> @neg_dissimilar_operand_kind_1(<4 x i8> %a, <4 x i16> %b) {
+ %tmp0 = zext <4 x i8> %a to <4 x i32>
+ %tmp1 = zext <4 x i16> %b to <4 x i32>
+ %tmp2 = add <4 x i32> %tmp0, %tmp1
+ ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: neg_illegal_vector_type_0
+; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <16 x i4> %a to <16 x i8>
+define <16 x i8> @neg_illegal_vector_type_0(<16 x i4> %a, <16 x i8> %b) {
+ %tmp0 = zext <16 x i4> %a to <16 x i8>
+ %tmp1 = sub <16 x i8> %b, %tmp0
+ ret <16 x i8> %tmp1
+}
+
+; COST-LABEL: neg_illegal_vector_type_1
+; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <1 x i16> %a to <1 x i32>
+define <1 x i32> @neg_illegal_vector_type_1(<1 x i16> %a, <1 x i32> %b) {
+ %tmp0 = zext <1 x i16> %a to <1 x i32>
+ %tmp1 = add <1 x i32> %b, %tmp0
+ ret <1 x i32> %tmp1
+}
+
+; COST-LABEL: neg_illegal_vector_type_2
+; COST-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i64>
+define <4 x i64> @neg_illegal_vector_type_2(<4 x i16> %a, <4 x i64> %b) {
+ %tmp0 = zext <4 x i16> %a to <4 x i64>
+ %tmp1 = add <4 x i64> %b, %tmp0
+ ret <4 x i64> %tmp1
+}
+
+; COST-LABEL: neg_illegal_vector_type_3
+; COST-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <3 x i34> %a to <3 x i68>
+define <3 x i68> @neg_illegal_vector_type_3(<3 x i34> %a, <3 x i68> %b) {
+ %tmp0 = zext <3 x i34> %a to <3 x i68>
+ %tmp1 = add <3 x i68> %b, %tmp0
+ ret <3 x i68> %tmp1
+}
diff --git a/test/Analysis/CostModel/AMDGPU/extractelement.ll b/test/Analysis/CostModel/AMDGPU/extractelement.ll
index 1efbb5873acb..54c8b6c52365 100644
--- a/test/Analysis/CostModel/AMDGPU/extractelement.ll
+++ b/test/Analysis/CostModel/AMDGPU/extractelement.ll
@@ -1,7 +1,9 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX9 %s
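+
+; Per-subtarget RUN lines are used because the cost of i16 vector element
+; access differs by subtarget: VI and GFX9 are expected to read element 0 of
+; a <2 x i16> for free, while CI is not; see the v2i16 cases at the end of
+; the file.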
-; CHECK: 'extractelement_v2i32'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32>
+; GCN: 'extractelement_v2i32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i32>
define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
%vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
%elt = extractelement <2 x i32> %vec, i32 1
@@ -9,8 +11,8 @@ define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32
ret void
}
-; CHECK: 'extractelement_v2f32'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float>
+; GCN: 'extractelement_v2f32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <2 x float>
define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
%elt = extractelement <2 x float> %vec, i32 1
@@ -18,8 +20,8 @@ define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x f
ret void
}
-; CHECK: 'extractelement_v3i32'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32>
+; GCN: 'extractelement_v3i32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i32>
define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
%elt = extractelement <3 x i32> %vec, i32 1
@@ -27,8 +29,8 @@ define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32
ret void
}
-; CHECK: 'extractelement_v4i32'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32>
+; GCN: 'extractelement_v4i32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i32>
define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
%vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
%elt = extractelement <4 x i32> %vec, i32 1
@@ -36,8 +38,8 @@ define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32
ret void
}
-; CHECK: 'extractelement_v8i32'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32>
+; GCN: 'extractelement_v8i32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i32>
define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
%vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
%elt = extractelement <8 x i32> %vec, i32 1
@@ -46,8 +48,8 @@ define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32
}
; FIXME: Should be non-0
-; CHECK: 'extractelement_v8i32_dynindex'
-; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32>
+; GCN: 'extractelement_v8i32_dynindex'
+; GCN: estimated cost of 2 for {{.*}} extractelement <8 x i32>
define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
%vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
%elt = extractelement <8 x i32> %vec, i32 %idx
@@ -55,8 +57,8 @@ define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out,
ret void
}
-; CHECK: 'extractelement_v2i64'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64>
+; GCN: 'extractelement_v2i64'
+; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i64>
define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
%vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
%elt = extractelement <2 x i64> %vec, i64 1
@@ -64,8 +66,8 @@ define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64
ret void
}
-; CHECK: 'extractelement_v3i64'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64>
+; GCN: 'extractelement_v3i64'
+; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i64>
define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
%elt = extractelement <3 x i64> %vec, i64 1
@@ -73,8 +75,8 @@ define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64
ret void
}
-; CHECK: 'extractelement_v4i64'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64>
+; GCN: 'extractelement_v4i64'
+; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i64>
define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
%vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
%elt = extractelement <4 x i64> %vec, i64 1
@@ -82,8 +84,8 @@ define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64
ret void
}
-; CHECK: 'extractelement_v8i64'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64>
+; GCN: 'extractelement_v8i64'
+; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i64>
define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
%vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
%elt = extractelement <8 x i64> %vec, i64 1
@@ -91,8 +93,8 @@ define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64
ret void
}
-; CHECK: 'extractelement_v4i8'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8>
+; GCN: 'extractelement_v4i8'
+; GCN: estimated cost of 1 for {{.*}} extractelement <4 x i8>
define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
%vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
%elt = extractelement <4 x i8> %vec, i8 1
@@ -100,11 +102,31 @@ define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> a
ret void
}
-; CHECK: 'extractelement_v2i16'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16>
-define amdgpu_kernel void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+; GCN: 'extractelement_0_v2i16':
+; CI: estimated cost of 1 for {{.*}} extractelement <2 x i16> %vec, i16 0
+; VI: estimated cost of 0 for {{.*}} extractelement <2 x i16>
+; GFX9: estimated cost of 0 for {{.*}} extractelement <2 x i16>
+define amdgpu_kernel void @extractelement_0_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %elt = extractelement <2 x i16> %vec, i16 0
+ store i16 %elt, i16 addrspace(1)* %out
+ ret void
+}
+
+; GCN: 'extractelement_1_v2i16':
+; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16>
+define amdgpu_kernel void @extractelement_1_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
%elt = extractelement <2 x i16> %vec, i16 1
store i16 %elt, i16 addrspace(1)* %out
ret void
}
+
+; GCN: 'extractelement_var_v2i16'
+; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16>
+define amdgpu_kernel void @extractelement_var_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, i32 %idx) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %elt = extractelement <2 x i16> %vec, i32 %idx
+ store i16 %elt, i16 addrspace(1)* %out
+ ret void
+}
diff --git a/test/Analysis/CostModel/AMDGPU/insertelement.ll b/test/Analysis/CostModel/AMDGPU/insertelement.ll
index 6f296a3e7a34..67ab2607acd5 100644
--- a/test/Analysis/CostModel/AMDGPU/insertelement.ll
+++ b/test/Analysis/CostModel/AMDGPU/insertelement.ll
@@ -1,37 +1,50 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX9 %s
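+
+; As with the extractelement tests, per-subtarget prefixes are used because
+; inserting into element 0 of a <2 x i16> is expected to be free on VI and
+; GFX9 but not on CI.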
-; CHECK: 'insertelement_v2i32'
-; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i32>
+; GCN-LABEL: 'insertelement_v2i32'
+; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i32>
define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
%vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
- %insert = insertelement <2 x i32> %vec, i32 1, i32 123
+ %insert = insertelement <2 x i32> %vec, i32 123, i32 1
store <2 x i32> %insert, <2 x i32> addrspace(1)* %out
ret void
}
-; CHECK: 'insertelement_v2i64'
-; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i64>
+; GCN-LABEL: 'insertelement_v2i64'
+; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i64>
define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
%vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
- %insert = insertelement <2 x i64> %vec, i64 1, i64 123
+ %insert = insertelement <2 x i64> %vec, i64 123, i64 1
store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
ret void
}
-; CHECK: 'insertelement_v2i16'
-; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i16>
-define amdgpu_kernel void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+; GCN-LABEL: 'insertelement_0_v2i16'
+; CI: estimated cost of 1 for {{.*}} insertelement <2 x i16>
+; VI: estimated cost of 0 for {{.*}} insertelement <2 x i16>
+; GFX9: estimated cost of 0 for {{.*}} insertelement <2 x i16>
+define amdgpu_kernel void @insertelement_0_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
- %insert = insertelement <2 x i16> %vec, i16 1, i16 123
+ %insert = insertelement <2 x i16> %vec, i16 123, i16 0
store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
ret void
}
-; CHECK: 'insertelement_v2i8'
-; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i8>
-define amdgpu_kernel void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
+; GCN-LABEL: 'insertelement_1_v2i16'
+; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i16>
+define amdgpu_kernel void @insertelement_1_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %insert = insertelement <2 x i16> %vec, i16 123, i16 1
+ store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: 'insertelement_1_v2i8'
+; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i8>
+define amdgpu_kernel void @insertelement_1_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
%vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
- %insert = insertelement <2 x i8> %vec, i8 1, i8 123
+ %insert = insertelement <2 x i8> %vec, i8 123, i8 1
store <2 x i8> %insert, <2 x i8> addrspace(1)* %out
ret void
}
diff --git a/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/test/Analysis/CostModel/AMDGPU/shufflevector.ll
new file mode 100644
index 000000000000..cc756c82fed3
--- /dev/null
+++ b/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -0,0 +1,43 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GFX9,GCN %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=VI,GCN %s
+
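+; Sub-dword shuffles of a single <2 x i16> source are expected to be free
+; (cost 0) on GFX9, while a shuffle combining two sources is expected to
+; cost 2 on both subtargets checked here.
+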
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
+define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
+define amdgpu_kernel void @shufflevector_01_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+define amdgpu_kernel void @shufflevector_10_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+define amdgpu_kernel void @shufflevector_11_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+define amdgpu_kernel void @shufflevector_02_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr0, <2 x i16> addrspace(1)* %vaddr1) {
+ %vec0 = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr0
+ %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr1
+ %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
diff --git a/test/Analysis/CostModel/X86/div.ll b/test/Analysis/CostModel/X86/div.ll
index 0ac06ff75ebe..dabaaef3596a 100644
--- a/test/Analysis/CostModel/X86/div.ll
+++ b/test/Analysis/CostModel/X86/div.ll
@@ -139,14 +139,14 @@ define i32 @sdiv_uniformconst() {
; SSE2: cost of 38 {{.*}} %V8i32 = sdiv
; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv
; SSE42: cost of 30 {{.*}} %V8i32 = sdiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = sdiv
; AVX2: cost of 15 {{.*}} %V8i32 = sdiv
; AVX512: cost of 15 {{.*}} %V8i32 = sdiv
%V8i32 = sdiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; SSE2: cost of 76 {{.*}} %V16i32 = sdiv
; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv
; SSE42: cost of 60 {{.*}} %V16i32 = sdiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = sdiv
; AVX2: cost of 30 {{.*}} %V16i32 = sdiv
; AVX512: cost of 15 {{.*}} %V16i32 = sdiv
%V16i32 = sdiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
@@ -157,12 +157,12 @@ define i32 @sdiv_uniformconst() {
; AVX: cost of 6 {{.*}} %V8i16 = sdiv
%V8i16 = sdiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; SSE: cost of 12 {{.*}} %V16i16 = sdiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv
+ ; AVX1: cost of 14 {{.*}} %V16i16 = sdiv
; AVX2: cost of 6 {{.*}} %V16i16 = sdiv
; AVX512: cost of 6 {{.*}} %V16i16 = sdiv
%V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; SSE: cost of 24 {{.*}} %V32i16 = sdiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = sdiv
; AVX2: cost of 12 {{.*}} %V32i16 = sdiv
; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv
; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv
@@ -203,12 +203,12 @@ define i32 @udiv_uniformconst() {
; AVX: cost of 15 {{.*}} %V4i32 = udiv
%V4i32 = udiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
; SSE: cost of 30 {{.*}} %V8i32 = udiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = udiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = udiv
; AVX2: cost of 15 {{.*}} %V8i32 = udiv
; AVX512: cost of 15 {{.*}} %V8i32 = udiv
%V8i32 = udiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; SSE: cost of 60 {{.*}} %V16i32 = udiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = udiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = udiv
; AVX2: cost of 30 {{.*}} %V16i32 = udiv
; AVX512: cost of 15 {{.*}} %V16i32 = udiv
%V16i32 = udiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
@@ -219,12 +219,12 @@ define i32 @udiv_uniformconst() {
; AVX: cost of 6 {{.*}} %V8i16 = udiv
%V8i16 = udiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; SSE: cost of 12 {{.*}} %V16i16 = udiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = udiv
+ ; AVX1: cost of 14 {{.*}} %V16i16 = udiv
; AVX2: cost of 6 {{.*}} %V16i16 = udiv
; AVX512: cost of 6 {{.*}} %V16i16 = udiv
%V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
; SSE: cost of 24 {{.*}} %V32i16 = udiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = udiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = udiv
; AVX2: cost of 12 {{.*}} %V32i16 = udiv
; AVX512F: cost of 12 {{.*}} %V32i16 = udiv
; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv
@@ -269,14 +269,14 @@ define i32 @sdiv_uniformconstpow2() {
; SSE2: cost of 38 {{.*}} %V8i32 = sdiv
; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv
; SSE42: cost of 30 {{.*}} %V8i32 = sdiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = sdiv
; AVX2: cost of 15 {{.*}} %V8i32 = sdiv
; AVX512: cost of 15 {{.*}} %V8i32 = sdiv
%V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
; SSE2: cost of 76 {{.*}} %V16i32 = sdiv
; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv
; SSE42: cost of 60 {{.*}} %V16i32 = sdiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = sdiv
; AVX2: cost of 30 {{.*}} %V16i32 = sdiv
; AVX512: cost of 15 {{.*}} %V16i32 = sdiv
%V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -287,12 +287,12 @@ define i32 @sdiv_uniformconstpow2() {
; AVX: cost of 6 {{.*}} %V8i16 = sdiv
%V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE: cost of 12 {{.*}} %V16i16 = sdiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv
+ ; AVX1: cost of 14 {{.*}} %V16i16 = sdiv
; AVX2: cost of 6 {{.*}} %V16i16 = sdiv
; AVX512: cost of 6 {{.*}} %V16i16 = sdiv
%V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE: cost of 24 {{.*}} %V32i16 = sdiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = sdiv
; AVX2: cost of 12 {{.*}} %V32i16 = sdiv
; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv
; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv
@@ -333,12 +333,12 @@ define i32 @udiv_uniformconstpow2() {
; AVX: cost of 15 {{.*}} %V4i32 = udiv
%V4i32 = udiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
; SSE: cost of 30 {{.*}} %V8i32 = udiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = udiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = udiv
; AVX2: cost of 15 {{.*}} %V8i32 = udiv
; AVX512: cost of 15 {{.*}} %V8i32 = udiv
%V8i32 = udiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
; SSE: cost of 60 {{.*}} %V16i32 = udiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = udiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = udiv
; AVX2: cost of 30 {{.*}} %V16i32 = udiv
; AVX512: cost of 15 {{.*}} %V16i32 = udiv
%V16i32 = udiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -349,12 +349,12 @@ define i32 @udiv_uniformconstpow2() {
; AVX: cost of 6 {{.*}} %V8i16 = udiv
%V8i16 = udiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE: cost of 12 {{.*}} %V16i16 = udiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = udiv
+ ; AVX1: cost of 14 {{.*}} %V16i16 = udiv
; AVX2: cost of 6 {{.*}} %V16i16 = udiv
; AVX512: cost of 6 {{.*}} %V16i16 = udiv
%V16i16 = udiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
; SSE: cost of 24 {{.*}} %V32i16 = udiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = udiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = udiv
; AVX2: cost of 12 {{.*}} %V32i16 = udiv
; AVX512F: cost of 12 {{.*}} %V32i16 = udiv
; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv
diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
index a23b13fb2e25..eabc2330ddc6 100644
--- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -33,10 +33,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i64':
; SSE2: Found an estimated cost of 24 for instruction: %shift
; SSE41: Found an estimated cost of 24 for instruction: %shift
-; AVX: Found an estimated cost of 24 for instruction: %shift
+; AVX: Found an estimated cost of 26 for instruction: %shift
; AVX2: Found an estimated cost of 4 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%shift = ashr <4 x i64> %a, %b
ret <4 x i64> %shift
}
@@ -45,10 +45,10 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64':
; SSE2: Found an estimated cost of 48 for instruction: %shift
; SSE41: Found an estimated cost of 48 for instruction: %shift
-; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 52 for instruction: %shift
; AVX2: Found an estimated cost of 8 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%shift = ashr <8 x i64> %a, %b
ret <8 x i64> %shift
}
@@ -70,10 +70,10 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32':
; SSE2: Found an estimated cost of 32 for instruction: %shift
; SSE41: Found an estimated cost of 24 for instruction: %shift
-; AVX: Found an estimated cost of 24 for instruction: %shift
+; AVX: Found an estimated cost of 26 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = ashr <8 x i32> %a, %b
ret <8 x i32> %shift
@@ -83,10 +83,10 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32':
; SSE2: Found an estimated cost of 64 for instruction: %shift
; SSE41: Found an estimated cost of 48 for instruction: %shift
-; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 52 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = ashr <16 x i32> %a, %b
ret <16 x i32> %shift
@@ -109,11 +109,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16':
; SSE2: Found an estimated cost of 64 for instruction: %shift
; SSE41: Found an estimated cost of 28 for instruction: %shift
-; AVX: Found an estimated cost of 28 for instruction: %shift
+; AVX: Found an estimated cost of 30 for instruction: %shift
; AVX2: Found an estimated cost of 10 for instruction: %shift
; AVX512F: Found an estimated cost of 10 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%shift = ashr <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -122,11 +122,11 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16':
; SSE2: Found an estimated cost of 128 for instruction: %shift
; SSE41: Found an estimated cost of 56 for instruction: %shift
-; AVX: Found an estimated cost of 56 for instruction: %shift
+; AVX: Found an estimated cost of 60 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%shift = ashr <32 x i16> %a, %b
ret <32 x i16> %shift
}
@@ -147,11 +147,11 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8':
; SSE2: Found an estimated cost of 108 for instruction: %shift
; SSE41: Found an estimated cost of 48 for instruction: %shift
-; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 50 for instruction: %shift
; AVX2: Found an estimated cost of 24 for instruction: %shift
; AVX512F: Found an estimated cost of 24 for instruction: %shift
; AVX512BW: Found an estimated cost of 24 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%shift = ashr <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -160,11 +160,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8':
; SSE2: Found an estimated cost of 216 for instruction: %shift
; SSE41: Found an estimated cost of 96 for instruction: %shift
-; AVX: Found an estimated cost of 96 for instruction: %shift
+; AVX: Found an estimated cost of 100 for instruction: %shift
; AVX2: Found an estimated cost of 48 for instruction: %shift
; AVX512F: Found an estimated cost of 48 for instruction: %shift
; AVX512BW: Found an estimated cost of 24 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
}
@@ -191,11 +191,10 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i64':
; SSE2: Found an estimated cost of 8 for instruction: %shift
; SSE41: Found an estimated cost of 8 for instruction: %shift
-; AVX: Found an estimated cost of 8 for instruction: %shift
-; AVX2: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 10 for instruction: %shift
+; AVX2: Found an estimated cost of 4 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
-; XOPAVX2: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%insert = insertelement <4 x i64> undef, i64 %b, i32 0
%splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
%shift = ashr <4 x i64> %a, %splat
@@ -206,11 +205,10 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64':
; SSE2: Found an estimated cost of 16 for instruction: %shift
; SSE41: Found an estimated cost of 16 for instruction: %shift
-; AVX: Found an estimated cost of 16 for instruction: %shift
-; AVX2: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 20 for instruction: %shift
+; AVX2: Found an estimated cost of 8 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
-; XOPAVX2: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%insert = insertelement <8 x i64> undef, i64 %b, i32 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
%shift = ashr <8 x i64> %a, %splat
@@ -235,10 +233,10 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%insert = insertelement <8 x i32> undef, i32 %b, i32 0
%splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -250,10 +248,10 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%insert = insertelement <16 x i32> undef, i32 %b, i32 0
%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -279,10 +277,10 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%insert = insertelement <16 x i16> undef, i16 %b, i32 0
%splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -294,11 +292,11 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%insert = insertelement <32 x i16> undef, i16 %b, i32 0
%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
@@ -324,10 +322,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8':
; SSE2: Found an estimated cost of 108 for instruction: %shift
; SSE41: Found an estimated cost of 48 for instruction: %shift
-; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 50 for instruction: %shift
; AVX2: Found an estimated cost of 24 for instruction: %shift
; AVX512: Found an estimated cost of 24 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%insert = insertelement <32 x i8> undef, i8 %b, i32 0
%splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = ashr <32 x i8> %a, %splat
@@ -338,11 +336,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8':
; SSE2: Found an estimated cost of 216 for instruction: %shift
; SSE41: Found an estimated cost of 96 for instruction: %shift
-; AVX: Found an estimated cost of 96 for instruction: %shift
+; AVX: Found an estimated cost of 100 for instruction: %shift
; AVX2: Found an estimated cost of 48 for instruction: %shift
; AVX512F: Found an estimated cost of 48 for instruction: %shift
; AVX512BW: Found an estimated cost of 24 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%insert = insertelement <64 x i8> undef, i8 %b, i32 0
%splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = ashr <64 x i8> %a, %splat
@@ -369,10 +367,10 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i64':
; SSE2: Found an estimated cost of 24 for instruction: %shift
; SSE41: Found an estimated cost of 24 for instruction: %shift
-; AVX: Found an estimated cost of 24 for instruction: %shift
+; AVX: Found an estimated cost of 26 for instruction: %shift
; AVX2: Found an estimated cost of 4 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 15, i64 31>
ret <4 x i64> %shift
}
@@ -381,10 +379,10 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64':
; SSE2: Found an estimated cost of 48 for instruction: %shift
; SSE41: Found an estimated cost of 48 for instruction: %shift
-; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 52 for instruction: %shift
; AVX2: Found an estimated cost of 8 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%shift = ashr <8 x i64> %a, <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>
ret <8 x i64> %shift
}
@@ -406,10 +404,10 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i32':
; SSE2: Found an estimated cost of 32 for instruction: %shift
; SSE41: Found an estimated cost of 24 for instruction: %shift
-; AVX: Found an estimated cost of 24 for instruction: %shift
+; AVX: Found an estimated cost of 26 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
ret <8 x i32> %shift
@@ -419,10 +417,10 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32':
; SSE2: Found an estimated cost of 64 for instruction: %shift
; SSE41: Found an estimated cost of 48 for instruction: %shift
-; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 52 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = ashr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
ret <16 x i32> %shift
@@ -445,11 +443,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i16':
; SSE2: Found an estimated cost of 64 for instruction: %shift
; SSE41: Found an estimated cost of 28 for instruction: %shift
-; AVX: Found an estimated cost of 28 for instruction: %shift
+; AVX: Found an estimated cost of 30 for instruction: %shift
; AVX2: Found an estimated cost of 10 for instruction: %shift
; AVX512F: Found an estimated cost of 10 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <16 x i16> %shift
}
@@ -458,11 +456,11 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16':
; SSE2: Found an estimated cost of 128 for instruction: %shift
; SSE41: Found an estimated cost of 56 for instruction: %shift
-; AVX: Found an estimated cost of 56 for instruction: %shift
+; AVX: Found an estimated cost of 60 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%shift = ashr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <32 x i16> %shift
}
@@ -483,10 +481,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8':
; SSE2: Found an estimated cost of 108 for instruction: %shift
; SSE41: Found an estimated cost of 48 for instruction: %shift
-; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 50 for instruction: %shift
; AVX2: Found an estimated cost of 24 for instruction: %shift
; AVX512: Found an estimated cost of 24 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -495,11 +493,11 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8':
; SSE2: Found an estimated cost of 216 for instruction: %shift
; SSE41: Found an estimated cost of 96 for instruction: %shift
-; AVX: Found an estimated cost of 96 for instruction: %shift
+; AVX: Found an estimated cost of 100 for instruction: %shift
; AVX2: Found an estimated cost of 48 for instruction: %shift
; AVX512F: Found an estimated cost of 48 for instruction: %shift
; AVX512BW: Found an estimated cost of 24 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
}
@@ -524,10 +522,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i64':
; SSE2: Found an estimated cost of 8 for instruction: %shift
; SSE41: Found an estimated cost of 8 for instruction: %shift
-; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 10 for instruction: %shift
; AVX2: Found an estimated cost of 4 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 4 for instruction: %shift
%shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
}
@@ -536,10 +535,11 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64':
; SSE2: Found an estimated cost of 16 for instruction: %shift
; SSE41: Found an estimated cost of 16 for instruction: %shift
-; AVX: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 20 for instruction: %shift
; AVX2: Found an estimated cost of 8 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 8 for instruction: %shift
%shift = ashr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
ret <8 x i64> %shift
}
@@ -560,10 +560,10 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i32':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
@@ -573,10 +573,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = ashr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <16 x i32> %shift
@@ -598,10 +598,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i16':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
@@ -611,11 +611,11 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <32 x i16> %shift
@@ -628,7 +628,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 4 for instruction: %shift
; AVX512: Found an estimated cost of 4 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 2 for instruction: %shift
%shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
}
@@ -637,10 +637,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8':
; SSE2: Found an estimated cost of 8 for instruction: %shift
; SSE41: Found an estimated cost of 8 for instruction: %shift
-; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 10 for instruction: %shift
; AVX2: Found an estimated cost of 4 for instruction: %shift
; AVX512: Found an estimated cost of 4 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 4 for instruction: %shift
%shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
@@ -650,11 +650,11 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8':
; SSE2: Found an estimated cost of 16 for instruction: %shift
; SSE41: Found an estimated cost of 16 for instruction: %shift
-; AVX: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 20 for instruction: %shift
; AVX2: Found an estimated cost of 8 for instruction: %shift
; AVX512F: Found an estimated cost of 8 for instruction: %shift
; AVX512BW: Found an estimated cost of 4 for instruction: %shift
-; XOPAVX: Found an estimated cost of 16 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 8 for instruction: %shift
%shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <64 x i8> %shift
diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
index 546b2bb50f26..6e890369d677 100644
--- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
@@ -34,10 +34,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i64':
; SSE2: Found an estimated cost of 8 for instruction: %shift
; SSE41: Found an estimated cost of 8 for instruction: %shift
-; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 10 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = lshr <4 x i64> %a, %b
ret <4 x i64> %shift
@@ -47,10 +47,10 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64':
; SSE2: Found an estimated cost of 16 for instruction: %shift
; SSE41: Found an estimated cost of 16 for instruction: %shift
-; AVX: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 20 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <8 x i64> %a, %b
ret <8 x i64> %shift
@@ -73,10 +73,10 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32':
; SSE2: Found an estimated cost of 32 for instruction: %shift
; SSE41: Found an estimated cost of 22 for instruction: %shift
-; AVX: Found an estimated cost of 22 for instruction: %shift
+; AVX: Found an estimated cost of 24 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = lshr <8 x i32> %a, %b
ret <8 x i32> %shift
@@ -86,10 +86,10 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32':
; SSE2: Found an estimated cost of 64 for instruction: %shift
; SSE41: Found an estimated cost of 44 for instruction: %shift
-; AVX: Found an estimated cost of 44 for instruction: %shift
+; AVX: Found an estimated cost of 48 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <16 x i32> %a, %b
ret <16 x i32> %shift
@@ -112,11 +112,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16':
; SSE2: Found an estimated cost of 64 for instruction: %shift
; SSE41: Found an estimated cost of 28 for instruction: %shift
-; AVX: Found an estimated cost of 28 for instruction: %shift
+; AVX: Found an estimated cost of 30 for instruction: %shift
; AVX2: Found an estimated cost of 10 for instruction: %shift
; AVX512F: Found an estimated cost of 10 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%shift = lshr <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -125,11 +125,11 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16':
; SSE2: Found an estimated cost of 128 for instruction: %shift
; SSE41: Found an estimated cost of 56 for instruction: %shift
-; AVX: Found an estimated cost of 56 for instruction: %shift
+; AVX: Found an estimated cost of 60 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%shift = lshr <32 x i16> %a, %b
ret <32 x i16> %shift
}
@@ -150,10 +150,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8':
; SSE2: Found an estimated cost of 52 for instruction: %shift
; SSE41: Found an estimated cost of 24 for instruction: %shift
-; AVX: Found an estimated cost of 24 for instruction: %shift
+; AVX: Found an estimated cost of 26 for instruction: %shift
; AVX2: Found an estimated cost of 11 for instruction: %shift
; AVX512: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%shift = lshr <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -162,11 +162,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8':
; SSE2: Found an estimated cost of 104 for instruction: %shift
; SSE41: Found an estimated cost of 48 for instruction: %shift
-; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 52 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
; AVX512BW: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
}
@@ -193,10 +193,10 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i64':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%insert = insertelement <4 x i64> undef, i64 %b, i32 0
%splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -208,10 +208,10 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%insert = insertelement <8 x i64> undef, i64 %b, i32 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -237,10 +237,10 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%insert = insertelement <8 x i32> undef, i32 %b, i32 0
%splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -252,10 +252,10 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%insert = insertelement <16 x i32> undef, i32 %b, i32 0
%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -281,10 +281,10 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%insert = insertelement <16 x i16> undef, i16 %b, i32 0
%splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -296,11 +296,11 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%insert = insertelement <32 x i16> undef, i16 %b, i32 0
%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
@@ -326,10 +326,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8':
; SSE2: Found an estimated cost of 52 for instruction: %shift
; SSE41: Found an estimated cost of 24 for instruction: %shift
-; AVX: Found an estimated cost of 24 for instruction: %shift
+; AVX: Found an estimated cost of 26 for instruction: %shift
; AVX2: Found an estimated cost of 11 for instruction: %shift
; AVX512: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%insert = insertelement <32 x i8> undef, i8 %b, i32 0
%splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = lshr <32 x i8> %a, %splat
@@ -340,11 +340,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8':
; SSE2: Found an estimated cost of 104 for instruction: %shift
; SSE41: Found an estimated cost of 48 for instruction: %shift
-; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 52 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
; AVX512BW: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%insert = insertelement <64 x i8> undef, i8 %b, i32 0
%splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = lshr <64 x i8> %a, %splat
@@ -372,10 +372,10 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i64':
; SSE2: Found an estimated cost of 8 for instruction: %shift
; SSE41: Found an estimated cost of 8 for instruction: %shift
-; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 10 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 15, i64 31>
ret <4 x i64> %shift
@@ -385,10 +385,10 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64':
; SSE2: Found an estimated cost of 16 for instruction: %shift
; SSE41: Found an estimated cost of 16 for instruction: %shift
-; AVX: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 20 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>
ret <8 x i64> %shift
@@ -411,10 +411,10 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i32':
; SSE2: Found an estimated cost of 32 for instruction: %shift
; SSE41: Found an estimated cost of 22 for instruction: %shift
-; AVX: Found an estimated cost of 22 for instruction: %shift
+; AVX: Found an estimated cost of 24 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
ret <8 x i32> %shift
@@ -424,10 +424,10 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32':
; SSE2: Found an estimated cost of 64 for instruction: %shift
; SSE41: Found an estimated cost of 44 for instruction: %shift
-; AVX: Found an estimated cost of 44 for instruction: %shift
+; AVX: Found an estimated cost of 48 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
ret <16 x i32> %shift
@@ -450,11 +450,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i16':
; SSE2: Found an estimated cost of 64 for instruction: %shift
; SSE41: Found an estimated cost of 28 for instruction: %shift
-; AVX: Found an estimated cost of 28 for instruction: %shift
+; AVX: Found an estimated cost of 30 for instruction: %shift
; AVX2: Found an estimated cost of 10 for instruction: %shift
; AVX512F: Found an estimated cost of 10 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <16 x i16> %shift
}
@@ -463,11 +463,11 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16':
; SSE2: Found an estimated cost of 128 for instruction: %shift
; SSE41: Found an estimated cost of 56 for instruction: %shift
-; AVX: Found an estimated cost of 56 for instruction: %shift
+; AVX: Found an estimated cost of 60 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <32 x i16> %shift
}
@@ -488,10 +488,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8':
; SSE2: Found an estimated cost of 52 for instruction: %shift
; SSE41: Found an estimated cost of 24 for instruction: %shift
-; AVX: Found an estimated cost of 24 for instruction: %shift
+; AVX: Found an estimated cost of 26 for instruction: %shift
; AVX2: Found an estimated cost of 11 for instruction: %shift
; AVX512: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 6 for instruction: %shift
%shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -500,11 +500,11 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8':
; SSE2: Found an estimated cost of 104 for instruction: %shift
; SSE41: Found an estimated cost of 48 for instruction: %shift
-; AVX: Found an estimated cost of 48 for instruction: %shift
+; AVX: Found an estimated cost of 52 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
; AVX512BW: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOP: Found an estimated cost of 12 for instruction: %shift
%shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
}
@@ -529,10 +529,10 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i64':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
@@ -542,10 +542,10 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
ret <8 x i64> %shift
@@ -567,10 +567,10 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i32':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
@@ -580,10 +580,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <16 x i32> %shift
@@ -605,10 +605,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i16':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
@@ -618,11 +618,11 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <32 x i16> %shift
@@ -644,10 +644,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 6 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 2 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 6 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
@@ -657,11 +657,11 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8':
; SSE2: Found an estimated cost of 8 for instruction: %shift
; SSE41: Found an estimated cost of 8 for instruction: %shift
-; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 12 for instruction: %shift
; AVX2: Found an estimated cost of 4 for instruction: %shift
; AVX512F: Found an estimated cost of 4 for instruction: %shift
; AVX512BW: Found an estimated cost of 2 for instruction: %shift
-; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 12 for instruction: %shift
; XOPAVX2: Found an estimated cost of 4 for instruction: %shift
%shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <64 x i8> %shift
diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll
index 90356f5ce8be..5e604bb7983e 100644
--- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll
@@ -35,10 +35,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i64':
; SSE2: Found an estimated cost of 8 for instruction: %shift
; SSE41: Found an estimated cost of 8 for instruction: %shift
-; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 10 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = shl <4 x i64> %a, %b
ret <4 x i64> %shift
@@ -48,10 +48,10 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64':
; SSE2: Found an estimated cost of 16 for instruction: %shift
; SSE41: Found an estimated cost of 16 for instruction: %shift
-; AVX: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 20 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <8 x i64> %a, %b
ret <8 x i64> %shift
@@ -74,10 +74,10 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32':
; SSE2: Found an estimated cost of 20 for instruction: %shift
; SSE41: Found an estimated cost of 8 for instruction: %shift
-; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 10 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = shl <8 x i32> %a, %b
ret <8 x i32> %shift
@@ -87,10 +87,10 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32':
; SSE2: Found an estimated cost of 40 for instruction: %shift
; SSE41: Found an estimated cost of 16 for instruction: %shift
-; AVX: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 20 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <16 x i32> %a, %b
ret <16 x i32> %shift
@@ -113,11 +113,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16':
; SSE2: Found an estimated cost of 64 for instruction: %shift
; SSE41: Found an estimated cost of 28 for instruction: %shift
-; AVX: Found an estimated cost of 28 for instruction: %shift
+; AVX: Found an estimated cost of 30 for instruction: %shift
; AVX2: Found an estimated cost of 10 for instruction: %shift
; AVX512F: Found an estimated cost of 10 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 4 for instruction: %shift
%shift = shl <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -126,11 +126,11 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16':
; SSE2: Found an estimated cost of 128 for instruction: %shift
; SSE41: Found an estimated cost of 56 for instruction: %shift
-; AVX: Found an estimated cost of 56 for instruction: %shift
+; AVX: Found an estimated cost of 60 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = shl <32 x i16> %a, %b
ret <32 x i16> %shift
}
@@ -151,10 +151,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8':
; SSE2: Found an estimated cost of 52 for instruction: %shift
; SSE41: Found an estimated cost of 22 for instruction: %shift
-; AVX: Found an estimated cost of 22 for instruction: %shift
+; AVX: Found an estimated cost of 24 for instruction: %shift
; AVX2: Found an estimated cost of 11 for instruction: %shift
; AVX512: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 4 for instruction: %shift
%shift = shl <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -163,11 +163,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8':
; SSE2: Found an estimated cost of 104 for instruction: %shift
; SSE41: Found an estimated cost of 44 for instruction: %shift
-; AVX: Found an estimated cost of 44 for instruction: %shift
+; AVX: Found an estimated cost of 48 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
; AVX512BW: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
}
@@ -194,10 +194,10 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i64':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%insert = insertelement <4 x i64> undef, i64 %b, i32 0
%splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -209,10 +209,10 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%insert = insertelement <8 x i64> undef, i64 %b, i32 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -238,10 +238,10 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%insert = insertelement <8 x i32> undef, i32 %b, i32 0
%splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -253,10 +253,10 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%insert = insertelement <16 x i32> undef, i32 %b, i32 0
%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -282,10 +282,10 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%insert = insertelement <16 x i16> undef, i16 %b, i32 0
%splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -297,11 +297,11 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%insert = insertelement <32 x i16> undef, i16 %b, i32 0
%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
@@ -327,10 +327,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8':
; SSE2: Found an estimated cost of 52 for instruction: %shift
; SSE41: Found an estimated cost of 22 for instruction: %shift
-; AVX: Found an estimated cost of 22 for instruction: %shift
+; AVX: Found an estimated cost of 24 for instruction: %shift
; AVX2: Found an estimated cost of 11 for instruction: %shift
; AVX512: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 4 for instruction: %shift
%insert = insertelement <32 x i8> undef, i8 %b, i32 0
%splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = shl <32 x i8> %a, %splat
@@ -341,11 +341,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8':
; SSE2: Found an estimated cost of 104 for instruction: %shift
; SSE41: Found an estimated cost of 44 for instruction: %shift
-; AVX: Found an estimated cost of 44 for instruction: %shift
+; AVX: Found an estimated cost of 48 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
; AVX512BW: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
%insert = insertelement <64 x i8> undef, i8 %b, i32 0
%splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = shl <64 x i8> %a, %splat
@@ -373,10 +373,10 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i64':
; SSE2: Found an estimated cost of 8 for instruction: %shift
; SSE41: Found an estimated cost of 8 for instruction: %shift
-; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 10 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = shl <4 x i64> %a, <i64 1, i64 7, i64 15, i64 31>
ret <4 x i64> %shift
@@ -386,10 +386,10 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64':
; SSE2: Found an estimated cost of 16 for instruction: %shift
; SSE41: Found an estimated cost of 16 for instruction: %shift
-; AVX: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 20 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <8 x i64> %a, <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>
ret <8 x i64> %shift
@@ -415,7 +415,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
ret <8 x i32> %shift
@@ -428,7 +428,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) {
; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
ret <16 x i32> %shift
@@ -453,7 +453,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <16 x i16> %shift
@@ -467,7 +467,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <32 x i16> %shift
@@ -489,10 +489,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8':
; SSE2: Found an estimated cost of 52 for instruction: %shift
; SSE41: Found an estimated cost of 22 for instruction: %shift
-; AVX: Found an estimated cost of 22 for instruction: %shift
+; AVX: Found an estimated cost of 24 for instruction: %shift
; AVX2: Found an estimated cost of 11 for instruction: %shift
; AVX512: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 4 for instruction: %shift
%shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -501,11 +501,11 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8':
; SSE2: Found an estimated cost of 104 for instruction: %shift
; SSE41: Found an estimated cost of 44 for instruction: %shift
-; AVX: Found an estimated cost of 44 for instruction: %shift
+; AVX: Found an estimated cost of 48 for instruction: %shift
; AVX2: Found an estimated cost of 22 for instruction: %shift
; AVX512F: Found an estimated cost of 22 for instruction: %shift
; AVX512BW: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <64 x i8> %shift
}
@@ -531,10 +531,10 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i64':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
@@ -544,10 +544,10 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
ret <8 x i64> %shift
@@ -570,10 +570,10 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i32':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
@@ -583,10 +583,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <16 x i32> %shift
@@ -608,10 +608,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i16':
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
@@ -621,11 +621,11 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
; AVX512BW: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <32 x i16> %shift
@@ -638,7 +638,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; AVX: Found an estimated cost of 2 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 2 for instruction: %shift
-; XOP: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 1 for instruction: %shift
%shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
}
@@ -647,7 +647,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8':
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 6 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 2 for instruction: %shift
; XOPAVX: Found an estimated cost of 4 for instruction: %shift
@@ -660,7 +660,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8':
; SSE2: Found an estimated cost of 8 for instruction: %shift
; SSE41: Found an estimated cost of 8 for instruction: %shift
-; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 12 for instruction: %shift
; AVX2: Found an estimated cost of 4 for instruction: %shift
; AVX512F: Found an estimated cost of 4 for instruction: %shift
; AVX512BW: Found an estimated cost of 2 for instruction: %shift
@@ -761,7 +761,7 @@ define <16 x i16> @test6(<16 x i16> %a) {
; SSE41: Found an estimated cost of 2 for instruction: %shl
; AVX: Found an estimated cost of 4 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl
-; XOPAVX: Found an estimated cost of 2 for instruction: %shl
+; XOPAVX: Found an estimated cost of 4 for instruction: %shl
; XOPAVX2: Found an estimated cost of 1 for instruction: %shl
@@ -778,7 +778,7 @@ define <8 x i32> @test7(<8 x i32> %a) {
; SSE41: Found an estimated cost of 2 for instruction: %shl
; AVX: Found an estimated cost of 4 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl
-; XOPAVX: Found an estimated cost of 2 for instruction: %shl
+; XOPAVX: Found an estimated cost of 4 for instruction: %shl
; XOPAVX2: Found an estimated cost of 1 for instruction: %shl
@@ -794,9 +794,9 @@ define <4 x i64> @test8(<4 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'test8':
; SSE2: Found an estimated cost of 8 for instruction: %shl
; SSE41: Found an estimated cost of 8 for instruction: %shl
-; AVX: Found an estimated cost of 8 for instruction: %shl
+; AVX: Found an estimated cost of 10 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl
-; XOPAVX: Found an estimated cost of 2 for instruction: %shl
+; XOPAVX: Found an estimated cost of 4 for instruction: %shl
; XOPAVX2: Found an estimated cost of 1 for instruction: %shl
@@ -811,7 +811,7 @@ define <32 x i16> @test9(<32 x i16> %a) {
; SSE41: Found an estimated cost of 4 for instruction: %shl
; AVX: Found an estimated cost of 8 for instruction: %shl
; AVX2: Found an estimated cost of 2 for instruction: %shl
-; XOPAVX: Found an estimated cost of 4 for instruction: %shl
+; XOPAVX: Found an estimated cost of 8 for instruction: %shl
; XOPAVX2: Found an estimated cost of 2 for instruction: %shl
@@ -826,7 +826,7 @@ define <16 x i32> @test10(<16 x i32> %a) {
; SSE41: Found an estimated cost of 4 for instruction: %shl
; AVX: Found an estimated cost of 8 for instruction: %shl
; AVX2: Found an estimated cost of 2 for instruction: %shl
-; XOPAVX: Found an estimated cost of 4 for instruction: %shl
+; XOPAVX: Found an estimated cost of 8 for instruction: %shl
; XOPAVX2: Found an estimated cost of 2 for instruction: %shl
@@ -842,7 +842,7 @@ define <8 x i64> @test11(<8 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'test11':
; SSE2: Found an estimated cost of 16 for instruction: %shl
; SSE41: Found an estimated cost of 16 for instruction: %shl
-; AVX: Found an estimated cost of 16 for instruction: %shl
+; AVX: Found an estimated cost of 20 for instruction: %shl
; AVX2: Found an estimated cost of 2 for instruction: %shl
-; XOPAVX: Found an estimated cost of 4 for instruction: %shl
+; XOPAVX: Found an estimated cost of 8 for instruction: %shl
; XOPAVX2: Found an estimated cost of 2 for instruction: %shl
diff --git a/test/Analysis/ScalarEvolution/different-loops-recs.ll b/test/Analysis/ScalarEvolution/different-loops-recs.ll
new file mode 100644
index 000000000000..ad3d1e0bd110
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/different-loops-recs.ll
@@ -0,0 +1,454 @@
+; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s
+
+; This test set ensures that we can correctly operate with recurrences from
+; different loops.
+
+; Check that we can evaluate a sum of phis from two different loops in any
+; order.
+
+define void @test_00() {
+
+; CHECK-LABEL: Classifying expressions for: @test_00
+; CHECK: %sum1 = add i32 %phi1, %phi2
+; CHECK-NEXT: --> {14,+,3}<%loop1>
+; CHECK: %sum2 = add i32 %sum1, %phi3
+; CHECK-NEXT: --> {20,+,6}<%loop1>
+; CHECK: %sum3 = add i32 %phi4, %phi5
+; CHECK-NEXT: --> {116,+,3}<%loop2>
+; CHECK: %sum4 = add i32 %sum3, %phi6
+; CHECK-NEXT: --> {159,+,6}<%loop2>
+; CHECK: %s1 = add i32 %phi1, %phi4
+; CHECK-NEXT: --> {{{{}}73,+,1}<%loop1>,+,1}<%loop2>
+; CHECK: %s2 = add i32 %phi5, %phi2
+; CHECK-NEXT: --> {{{{}}57,+,2}<%loop1>,+,2}<%loop2>
+; CHECK: %s3 = add i32 %sum1, %sum3
+; CHECK-NEXT: --> {{{{}}130,+,3}<%loop1>,+,3}<%loop2>
+; CHECK: %s4 = add i32 %sum4, %sum2
+; CHECK-NEXT: --> {{{{}}179,+,6}<%loop1>,+,6}<%loop2>
+; CHECK: %s5 = add i32 %phi3, %sum3
+; CHECK-NEXT: --> {{{{}}122,+,3}<%loop1>,+,3}<%loop2>
+; CHECK: %s6 = add i32 %sum2, %phi6
+; CHECK-NEXT: --> {{{{}}63,+,6}<%loop1>,+,3}<%loop2>
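+;
+; An illustrative sketch of the arithmetic above (derived from the phi start and
+; step values below; not part of the checked output): %phi1 = {10,+,1}<%loop1>
+; and %phi2 = {4,+,2}<%loop1>, so %sum1 = {14,+,3}<%loop1>. Across loops,
+; %s1 = %phi1 + %phi4 combines {10,+,1}<%loop1> with {63,+,1}<%loop2> into a
+; nested add-rec with start 10 + 63 = 73 and step 1 in each loop.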
+
+entry:
+ br label %loop1
+
+loop1:
+ %phi1 = phi i32 [ 10, %entry ], [ %phi1.inc, %loop1 ]
+ %phi2 = phi i32 [ 4, %entry ], [ %phi2.inc, %loop1 ]
+ %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ]
+ %phi1.inc = add i32 %phi1, 1
+ %phi2.inc = add i32 %phi2, 2
+ %phi3.inc = add i32 %phi3, 3
+ %sum1 = add i32 %phi1, %phi2
+ %sum2 = add i32 %sum1, %phi3
+ %cond1 = icmp ult i32 %sum2, 1000
+ br i1 %cond1, label %loop1, label %loop2
+
+loop2:
+ %phi4 = phi i32 [ 63, %loop1 ], [ %phi4.inc, %loop2 ]
+ %phi5 = phi i32 [ 53, %loop1 ], [ %phi5.inc, %loop2 ]
+ %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ]
+ %phi4.inc = add i32 %phi4, 1
+ %phi5.inc = add i32 %phi5, 2
+ %phi6.inc = add i32 %phi6, 3
+ %sum3 = add i32 %phi4, %phi5
+ %sum4 = add i32 %sum3, %phi6
+ %cond2 = icmp ult i32 %sum4, 1000
+ br i1 %cond2, label %loop2, label %exit
+
+exit:
+ %s1 = add i32 %phi1, %phi4
+ %s2 = add i32 %phi5, %phi2
+ %s3 = add i32 %sum1, %sum3
+ %s4 = add i32 %sum4, %sum2
+ %s5 = add i32 %phi3, %sum3
+ %s6 = add i32 %sum2, %phi6
+ ret void
+}
+
+; Check that we can evaluate a sum of phis+invariants from two different loops
+; in any order.
+
+define void @test_01(i32 %a, i32 %b) {
+
+; CHECK-LABEL: Classifying expressions for: @test_01
+; CHECK: %sum1 = add i32 %phi1, %phi2
+; CHECK-NEXT: --> {(%a + %b),+,3}<%loop1>
+; CHECK: %sum2 = add i32 %sum1, %phi3
+; CHECK-NEXT: --> {(6 + %a + %b),+,6}<%loop1>
+; CHECK: %is1 = add i32 %sum2, %a
+; CHECK-NEXT: --> {(6 + (2 * %a) + %b),+,6}<%loop1>
+; CHECK: %sum3 = add i32 %phi4, %phi5
+; CHECK-NEXT: --> {116,+,3}<%loop2>
+; CHECK: %sum4 = add i32 %sum3, %phi6
+; CHECK-NEXT: --> {159,+,6}<%loop2>
+; CHECK: %is2 = add i32 %sum4, %b
+; CHECK-NEXT: --> {(159 + %b),+,6}<%loop2>
+; CHECK: %ec2 = add i32 %is1, %is2
+; CHECK-NEXT: --> {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2>
+; CHECK: %s1 = add i32 %phi1, %is1
+; CHECK-NEXT: --> {(6 + (3 * %a) + %b),+,7}<%loop1>
+; CHECK: %s2 = add i32 %is2, %phi4
+; CHECK-NEXT: --> {(222 + %b),+,7}<%loop2>
+; CHECK: %s3 = add i32 %is1, %phi5
+; CHECK-NEXT: --> {{{{}}(59 + (2 * %a) + %b),+,6}<%loop1>,+,2}<%loop2>
+; CHECK: %s4 = add i32 %phi2, %is2
+; CHECK-NEXT: --> {{{{}}(159 + (2 * %b)),+,2}<%loop1>,+,6}<%loop2>
+; CHECK: %s5 = add i32 %is1, %is2
+; CHECK-NEXT: --> {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2>
+; CHECK: %s6 = add i32 %is2, %is1
+; CHECK-NEXT: --> {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2>
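+;
+; A sketch of how the loop-invariant operands fold in (not checked output):
+; %sum2 = {(6 + %a + %b),+,6}<%loop1>, so adding the invariant %a only shifts
+; the start: %is1 = {(6 + (2 * %a) + %b),+,6}<%loop1>. Likewise %ec2 adds the
+; starts (6 + 2a + b) + (159 + b) = 165 + 2a + 2b and nests the two loops.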
+
+entry:
+ br label %loop1
+
+loop1:
+ %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ]
+ %phi2 = phi i32 [ %b, %entry ], [ %phi2.inc, %loop1 ]
+ %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ]
+ %phi1.inc = add i32 %phi1, 1
+ %phi2.inc = add i32 %phi2, 2
+ %phi3.inc = add i32 %phi3, 3
+ %sum1 = add i32 %phi1, %phi2
+ %sum2 = add i32 %sum1, %phi3
+ %is1 = add i32 %sum2, %a
+ %cond1 = icmp ult i32 %is1, 1000
+ br i1 %cond1, label %loop1, label %loop2
+
+loop2:
+ %phi4 = phi i32 [ 63, %loop1 ], [ %phi4.inc, %loop2 ]
+ %phi5 = phi i32 [ 53, %loop1 ], [ %phi5.inc, %loop2 ]
+ %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ]
+ %phi4.inc = add i32 %phi4, 1
+ %phi5.inc = add i32 %phi5, 2
+ %phi6.inc = add i32 %phi6, 3
+ %sum3 = add i32 %phi4, %phi5
+ %sum4 = add i32 %sum3, %phi6
+ %is2 = add i32 %sum4, %b
+ %ec2 = add i32 %is1, %is2
+ %cond2 = icmp ult i32 %ec2, 1000
+ br i1 %cond2, label %loop2, label %exit
+
+exit:
+ %s1 = add i32 %phi1, %is1
+ %s2 = add i32 %is2, %phi4
+ %s3 = add i32 %is1, %phi5
+ %s4 = add i32 %phi2, %is2
+ %s5 = add i32 %is1, %is2
+ %s6 = add i32 %is2, %is1
+ ret void
+}
+
+; Check that we can correctly evaluate a sum of phis+variants from two different
+; loops in any order.
+
+define void @test_02(i32 %a, i32 %b, i32* %p) {
+
+; CHECK-LABEL: Classifying expressions for: @test_02
+; CHECK: %sum1 = add i32 %phi1, %phi2
+; CHECK-NEXT: --> {(%a + %b),+,3}<%loop1>
+; CHECK: %sum2 = add i32 %sum1, %phi3
+; CHECK-NEXT: --> {(6 + %a + %b),+,6}<%loop1>
+; CHECK: %is1 = add i32 %sum2, %v1
+; CHECK-NEXT: --> ({(6 + %a + %b),+,6}<%loop1> + %v1)
+; CHECK: %sum3 = add i32 %phi4, %phi5
+; CHECK-NEXT: --> {(%a + %b),+,3}<%loop2>
+; CHECK: %sum4 = add i32 %sum3, %phi6
+; CHECK-NEXT: --> {(43 + %a + %b),+,6}<%loop2>
+; CHECK: %is2 = add i32 %sum4, %v2
+; CHECK-NEXT: --> ({(43 + %a + %b),+,6}<%loop2> + %v2)
+; CHECK: %is3 = add i32 %v1, %sum2
+; CHECK-NEXT: --> ({(6 + %a + %b),+,6}<%loop1> + %v1)
+; CHECK: %ec2 = add i32 %is1, %is3
+; CHECK-NEXT: --> (2 * ({(6 + %a + %b),+,6}<%loop1> + %v1))
+; CHECK: %s1 = add i32 %phi1, %is1
+; CHECK-NEXT: --> ({(6 + (2 * %a) + %b),+,7}<%loop1> + %v1)
+; CHECK: %s2 = add i32 %is2, %phi4
+; CHECK-NEXT: --> ({(43 + (2 * %a) + %b),+,7}<%loop2> + %v2)
+; CHECK: %s3 = add i32 %is1, %phi5
+; CHECK-NEXT: --> {({(6 + (2 * %b) + %a),+,6}<%loop1> + %v1),+,2}<%loop2>
+; CHECK: %s4 = add i32 %phi2, %is2
+; CHECK-NEXT: --> ({{{{}}(43 + (2 * %b) + %a),+,2}<%loop1>,+,6}<%loop2> + %v2)
+; CHECK: %s5 = add i32 %is1, %is2
+; CHECK-NEXT: --> ({({(49 + (2 * %a) + (2 * %b)),+,6}<%loop1> + %v1),+,6}<%loop2> + %v2)
+; CHECK: %s6 = add i32 %is2, %is1
+; CHECK-NEXT: --> ({({(49 + (2 * %a) + (2 * %b)),+,6}<%loop1> + %v1),+,6}<%loop2> + %v2)
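+;
+; A sketch of the variant case (not checked output): %v1 and %v2 are loads, so
+; SCEV keeps them as opaque unknowns added outside the recurrences, e.g.
+; %is1 = ({(6 + %a + %b),+,6}<%loop1> + %v1). Since %is3 is the same
+; expression, %ec2 = %is1 + %is3 folds to twice that sum.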
+
+entry:
+ br label %loop1
+
+loop1:
+ %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ]
+ %phi2 = phi i32 [ %b, %entry ], [ %phi2.inc, %loop1 ]
+ %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ]
+ %phi1.inc = add i32 %phi1, 1
+ %phi2.inc = add i32 %phi2, 2
+ %phi3.inc = add i32 %phi3, 3
+ %v1 = load i32, i32* %p
+ %sum1 = add i32 %phi1, %phi2
+ %sum2 = add i32 %sum1, %phi3
+ %is1 = add i32 %sum2, %v1
+ %cond1 = icmp ult i32 %is1, 1000
+ br i1 %cond1, label %loop1, label %loop2
+
+loop2:
+ %phi4 = phi i32 [ %a, %loop1 ], [ %phi4.inc, %loop2 ]
+ %phi5 = phi i32 [ %b, %loop1 ], [ %phi5.inc, %loop2 ]
+ %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ]
+ %phi4.inc = add i32 %phi4, 1
+ %phi5.inc = add i32 %phi5, 2
+ %phi6.inc = add i32 %phi6, 3
+ %v2 = load i32, i32* %p
+ %sum3 = add i32 %phi4, %phi5
+ %sum4 = add i32 %sum3, %phi6
+ %is2 = add i32 %sum4, %v2
+ %is3 = add i32 %v1, %sum2
+ %ec2 = add i32 %is1, %is3
+ %cond2 = icmp ult i32 %ec2, 1000
+ br i1 %cond2, label %loop2, label %exit
+
+exit:
+ %s1 = add i32 %phi1, %is1
+ %s2 = add i32 %is2, %phi4
+ %s3 = add i32 %is1, %phi5
+ %s4 = add i32 %phi2, %is2
+ %s5 = add i32 %is1, %is2
+ %s6 = add i32 %is2, %is1
+ ret void
+}
+
+; A mix of the previous use cases, demonstrating that %s3 can be incorrectly
+; treated as a recurrence of loop1 if, due to operand order, we pick the
+; recurrences in the wrong order.
+
+define void @test_03(i32 %a, i32 %b, i32 %c, i32* %p) {
+
+; CHECK-LABEL: Classifying expressions for: @test_03
+; CHECK: %v1 = load i32, i32* %p
+; CHECK-NEXT: --> %v1
+; CHECK: %s1 = add i32 %phi1, %v1
+; CHECK-NEXT: --> {(%a + %v1),+,1}<%loop1>
+; CHECK: %s2 = add i32 %s1, %b
+; CHECK-NEXT: --> {(%a + %b + %v1),+,1}<%loop1>
+; CHECK: %s3 = add i32 %s2, %phi2
+; CHECK-NEXT: --> ({{{{}}((2 * %a) + %b),+,1}<%loop1>,+,2}<%loop2> + %v1)
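+;
+; A sketch of the hazard (not checked output): %s2 is a loop1 add-rec and %phi2
+; is {%a,+,2}<%loop2>, so the sum must be nested with %loop2 outermost, as
+; above. Folding %phi2 into the loop1 recurrence's start instead would wrongly
+; treat a loop2-variant value as invariant in loop2.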
+
+entry:
+ br label %loop1
+
+loop1:
+ %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ]
+ %phi1.inc = add i32 %phi1, 1
+ %cond1 = icmp ult i32 %phi1, %c
+ br i1 %cond1, label %loop1, label %loop2
+
+loop2:
+ %phi2 = phi i32 [ %a, %loop1 ], [ %phi2.inc, %loop2 ]
+ %phi2.inc = add i32 %phi2, 2
+ %v1 = load i32, i32* %p
+ %s1 = add i32 %phi1, %v1
+ %s2 = add i32 %s1, %b
+ %s3 = add i32 %s2, %phi2
+ %cond2 = icmp ult i32 %s3, %c
+ br i1 %cond2, label %loop2, label %exit
+
+exit:
+
+ ret void
+}
+
+; Another mix of the previous use cases, demonstrating that picking the wrong
+; loop for a recurrence may crash SCEV analysis.
+define void @test_04() {
+
+; CHECK-LABEL: Classifying expressions for: @test_04
+; CHECK: %tmp = phi i64 [ 2, %bb ], [ %tmp4, %bb3 ]
+; CHECK-NEXT: --> {2,+,1}<nuw><nsw><%loop1>
+; CHECK: %tmp2 = trunc i64 %tmp to i32
+; CHECK-NEXT: --> {2,+,1}<%loop1>
+; CHECK: %tmp4 = add nuw nsw i64 %tmp, 1
+; CHECK-NEXT: --> {3,+,1}<nuw><%loop1>
+; CHECK: %tmp7 = phi i64 [ %tmp15, %loop2 ], [ 2, %loop1 ]
+; CHECK-NEXT: --> {2,+,1}<nuw><nsw><%loop2>
+; CHECK: %tmp10 = sub i64 %tmp9, %tmp7
+; CHECK-NEXT: --> ((sext i8 %tmp8 to i64) + {-2,+,-1}<nw><%loop2>)
+; CHECK: %tmp11 = add i64 %tmp10, undef
+; CHECK-NEXT: --> ((sext i8 %tmp8 to i64) + {(-2 + undef),+,-1}<nw><%loop2>)
+; CHECK: %tmp13 = trunc i64 %tmp11 to i32
+; CHECK-NEXT: --> ((sext i8 %tmp8 to i32) + {(trunc i64 (-2 + undef) to i32),+,-1}<%loop2>)
+; CHECK: %tmp14 = sub i32 %tmp13, %tmp2
+; CHECK-NEXT: --> ((sext i8 %tmp8 to i32) + {{{{}}(-2 + (trunc i64 (-2 + undef) to i32)),+,-1}<%loop1>,+,-1}<%loop2>)
+; CHECK: %tmp15 = add nuw nsw i64 %tmp7, 1
+; CHECK-NEXT: --> {3,+,1}<nuw><nsw><%loop2>
+
+bb:
+ br label %loop1
+
+loop1:
+ %tmp = phi i64 [ 2, %bb ], [ %tmp4, %bb3 ]
+ %tmp2 = trunc i64 %tmp to i32
+ br i1 undef, label %loop2, label %bb3
+
+bb3:
+ %tmp4 = add nuw nsw i64 %tmp, 1
+ br label %loop1
+
+bb5:
+ ret void
+
+loop2:
+ %tmp7 = phi i64 [ %tmp15, %loop2 ], [ 2, %loop1 ]
+ %tmp8 = load i8, i8 addrspace(1)* undef, align 1
+ %tmp9 = sext i8 %tmp8 to i64
+ %tmp10 = sub i64 %tmp9, %tmp7
+ %tmp11 = add i64 %tmp10, undef
+ %tmp13 = trunc i64 %tmp11 to i32
+ %tmp14 = sub i32 %tmp13, %tmp2
+ %tmp15 = add nuw nsw i64 %tmp7, 1
+ %tmp16 = icmp slt i64 %tmp15, %tmp
+ br i1 %tmp16, label %loop2, label %bb5
+}
+
+@A = weak global [1000 x i32] zeroinitializer, align 32
+
+; Demonstrate a situation where we can add two recurrences of different degrees
+; from the same loop.
+define void @test_05(i32 %N) {
+
+; CHECK-LABEL: Classifying expressions for: @test_05
+; CHECK: %SQ = mul i32 %i.0, %i.0
+; CHECK-NEXT: --> {4,+,5,+,2}<%bb3>
+; CHECK: %tmp4 = mul i32 %i.0, 2
+; CHECK-NEXT: --> {4,+,2}<%bb3>
+; CHECK: %tmp5 = sub i32 %SQ, %tmp4
+; CHECK-NEXT: --> {0,+,3,+,2}<%bb3>
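+;
+; The quadratic rec above, spelled out (not checked output): with
+; %i.0 = {2,+,1}<%bb3>, %SQ = i*i takes values 4, 9, 16, 25, ... whose first
+; differences 5, 7, 9, ... form {5,+,2}, giving {4,+,5,+,2}<%bb3>. Subtracting
+; %tmp4 = {4,+,2}<%bb3> leaves {0,+,3,+,2}<%bb3>, i.e. 0, 3, 8, 15, ...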
+
+entry:
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ br label %bb3
+
+bb: ; preds = %bb3
+ %tmp = getelementptr [1000 x i32], [1000 x i32]* @A, i32 0, i32 %i.0 ; <i32*> [#uses=1]
+ store i32 123, i32* %tmp
+ %tmp2 = add i32 %i.0, 1 ; <i32> [#uses=1]
+ br label %bb3
+
+bb3: ; preds = %bb, %entry
+ %i.0 = phi i32 [ 2, %entry ], [ %tmp2, %bb ] ; <i32> [#uses=3]
+ %SQ = mul i32 %i.0, %i.0
+ %tmp4 = mul i32 %i.0, 2
+ %tmp5 = sub i32 %SQ, %tmp4
+ %tmp3 = icmp sle i32 %tmp5, 9999 ; <i1> [#uses=1]
+ br i1 %tmp3, label %bb, label %bb5
+
+bb5: ; preds = %bb3
+ br label %return
+
+return: ; preds = %bb5
+ ret void
+}
+
+; Check that we can add Phis from different loops with different nesting; the
+; nested loop comes first.
+define void @test_06() {
+
+; CHECK-LABEL: Classifying expressions for: @test_06
+; CHECK: %s1 = add i32 %phi1, %phi2
+; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2>
+; CHECK: %s2 = add i32 %phi2, %phi1
+; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2>
+; CHECK: %s3 = add i32 %phi1, %phi3
+; CHECK-NEXT: --> {{{{}}40,+,1}<%loop1>,+,3}<%loop3>
+; CHECK: %s4 = add i32 %phi3, %phi1
+; CHECK-NEXT: --> {{{{}}40,+,1}<%loop1>,+,3}<%loop3>
+; CHECK: %s5 = add i32 %phi2, %phi3
+; CHECK-NEXT: --> {{{{}}50,+,2}<%loop2>,+,3}<%loop3>
+; CHECK: %s6 = add i32 %phi3, %phi2
+; CHECK-NEXT: --> {{{{}}50,+,2}<%loop2>,+,3}<%loop3>
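+;
+; A sketch of the nesting (not checked output): %phi1 = {10,+,1}<%loop1> and
+; %phi2 = {20,+,2}<%loop2> with %loop2 nested in %loop1, so %s1 starts at
+; 10 + 20 = 30 and keeps both steps; commuting the operands (%s2) must give
+; the same nested add-rec.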
+
+entry:
+ br label %loop1
+
+loop1:
+ %phi1 = phi i32 [ 10, %entry ], [ %phi1.inc, %loop1.exit ]
+ br label %loop2
+
+loop2:
+ %phi2 = phi i32 [ 20, %loop1 ], [ %phi2.inc, %loop2 ]
+ %phi2.inc = add i32 %phi2, 2
+ %cond2 = icmp ult i32 %phi2.inc, 1000
+ br i1 %cond2, label %loop2, label %loop1.exit
+
+loop1.exit:
+ %phi1.inc = add i32 %phi1, 1
+ %cond1 = icmp ult i32 %phi1.inc, 1000
+ br i1 %cond1, label %loop1, label %loop3
+
+loop3:
+ %phi3 = phi i32 [ 30, %loop1.exit ], [ %phi3.inc, %loop3 ]
+ %phi3.inc = add i32 %phi3, 3
+ %cond3 = icmp ult i32 %phi3.inc, 1000
+ br i1 %cond3, label %loop3, label %exit
+
+exit:
+ %s1 = add i32 %phi1, %phi2
+ %s2 = add i32 %phi2, %phi1
+ %s3 = add i32 %phi1, %phi3
+ %s4 = add i32 %phi3, %phi1
+ %s5 = add i32 %phi2, %phi3
+ %s6 = add i32 %phi3, %phi2
+ ret void
+}
+
+; Check that we can add Phis from different loops with different nesting; the
+; nested loop comes second.
+define void @test_07() {
+
+; CHECK-LABEL: Classifying expressions for: @test_07
+; CHECK: %s1 = add i32 %phi1, %phi2
+; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2>
+; CHECK: %s2 = add i32 %phi2, %phi1
+; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2>
+; CHECK: %s3 = add i32 %phi1, %phi3
+; CHECK-NEXT: --> {{{{}}40,+,3}<%loop3>,+,1}<%loop1>
+; CHECK: %s4 = add i32 %phi3, %phi1
+; CHECK-NEXT: --> {{{{}}40,+,3}<%loop3>,+,1}<%loop1>
+; CHECK: %s5 = add i32 %phi2, %phi3
+; CHECK-NEXT: --> {{{{}}50,+,3}<%loop3>,+,2}<%loop2>
+; CHECK: %s6 = add i32 %phi3, %phi2
+; CHECK-NEXT: --> {{{{}}50,+,3}<%loop3>,+,2}<%loop2>
+
+entry:
+ br label %loop3
+
+loop3:
+ %phi3 = phi i32 [ 30, %entry ], [ %phi3.inc, %loop3 ]
+ %phi3.inc = add i32 %phi3, 3
+ %cond3 = icmp ult i32 %phi3.inc, 1000
+ br i1 %cond3, label %loop3, label %loop1
+
+loop1:
+ %phi1 = phi i32 [ 10, %loop3 ], [ %phi1.inc, %loop1.exit ]
+ br label %loop2
+
+loop2:
+ %phi2 = phi i32 [ 20, %loop1 ], [ %phi2.inc, %loop2 ]
+ %phi2.inc = add i32 %phi2, 2
+ %cond2 = icmp ult i32 %phi2.inc, 1000
+ br i1 %cond2, label %loop2, label %loop1.exit
+
+loop1.exit:
+ %phi1.inc = add i32 %phi1, 1
+ %cond1 = icmp ult i32 %phi1.inc, 1000
+ br i1 %cond1, label %exit, label %loop1
+
+exit:
+ %s1 = add i32 %phi1, %phi2
+ %s2 = add i32 %phi2, %phi1
+ %s3 = add i32 %phi1, %phi3
+ %s4 = add i32 %phi3, %phi1
+ %s5 = add i32 %phi2, %phi3
+ %s6 = add i32 %phi3, %phi2
+ ret void
+}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
index eab314eaa9c2..655d4558a5e1 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
@@ -5,22 +5,22 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-
; TBAA should prove that these calls don't interfere, since they are
; IntrArgReadMem and have TBAA metadata.
-; CHECK: define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) {
+; CHECK: define <8 x i16> @test0(<8 x i16>* %p, <8 x i16>* %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[NUW:#[0-9]+]]
-; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK-NEXT: %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) [[NUW:#[0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m)
; CHECK-NEXT: %c = add <8 x i16> %a, %a
-define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) {
+define <8 x i16> @test0(<8 x i16>* %p, <8 x i16>* %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) {
entry:
- %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2
- call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16), !tbaa !1
- %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2
+ %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind, !tbaa !2
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m), !tbaa !1
+ %b = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind, !tbaa !2
%c = add <8 x i16> %a, %b
ret <8 x i16> %c
}
-declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
-declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) nounwind readonly
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) nounwind
; CHECK: attributes #0 = { argmemonly nounwind readonly }
; CHECK: attributes #1 = { argmemonly nounwind }
diff --git a/test/Assembler/globalvariable-attributes.ll b/test/Assembler/globalvariable-attributes.ll
new file mode 100644
index 000000000000..64227a451c25
--- /dev/null
+++ b/test/Assembler/globalvariable-attributes.ll
@@ -0,0 +1,19 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+
+@g1 = global i32 7 "key" = "value" "key2" = "value2"
+@g2 = global i32 2, align 4 "key3" = "value3"
+@g3 = global i32 2 #0
+@g4 = global i32 2, align 4 "key5" = "value5" #0
+
+attributes #0 = { "string" = "value" nobuiltin norecurse }
+
+; CHECK: @g1 = global i32 7 #0
+; CHECK: @g2 = global i32 2, align 4 #1
+; CHECK: @g3 = global i32 2 #2
+; CHECK: @g4 = global i32 2, align 4 #3
+
+; CHECK: attributes #0 = { "key"="value" "key2"="value2" }
+; CHECK: attributes #1 = { "key3"="value3" }
+; CHECK: attributes #2 = { nobuiltin norecurse "string"="value" }
+; CHECK: attributes #3 = { nobuiltin norecurse "key5"="value5" "string"="value" }
+
diff --git a/test/Bitcode/globalvariable-attributes.ll b/test/Bitcode/globalvariable-attributes.ll
new file mode 100644
index 000000000000..cbab3b71e58a
--- /dev/null
+++ b/test/Bitcode/globalvariable-attributes.ll
@@ -0,0 +1,19 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+@g1 = global i32 7 "key" = "value" "key2" = "value2"
+@g2 = global i32 2, align 4 "key3" = "value3"
+@g3 = global i32 2 #0
+@g4 = global i32 2, align 4 "key5" = "value5" #0
+
+attributes #0 = { "string" = "value" nobuiltin norecurse }
+
+; CHECK: @g1 = global i32 7 #0
+; CHECK: @g2 = global i32 2, align 4 #1
+; CHECK: @g3 = global i32 2 #2
+; CHECK: @g4 = global i32 2, align 4 #3
+
+; CHECK: attributes #0 = { "key"="value" "key2"="value2" }
+; CHECK: attributes #1 = { "key3"="value3" }
+; CHECK: attributes #2 = { nobuiltin norecurse "string"="value" }
+; CHECK: attributes #3 = { nobuiltin norecurse "key5"="value5" "string"="value" }
+
diff --git a/test/Bitcode/ptest-old.ll b/test/Bitcode/ptest-old.ll
index c1e1cae37368..53ffef900b57 100644
--- a/test/Bitcode/ptest-old.ll
+++ b/test/Bitcode/ptest-old.ll
@@ -1,5 +1,6 @@
; RUN: llvm-as < %s | llvm-dis | FileCheck %s
; RUN: verify-uselistorder < %s
+; REQUIRES: x86
define i32 @foo(<4 x float> %bar) nounwind {
entry:
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
index 982bb5cb7e53..b64d5bd52bfc 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
@@ -29,7 +29,7 @@
; CHECK-NEXT: <VERSION
; CHECK-NEXT: <VALUE_GUID op0=25 op1=123/>
; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123
-; CHECK-NEXT: <PERMODULE_PROFILE {{.*}} op4=1 op5=3 op6=5 op7=1 op8=2 op9=3 op10=4 op11=3 op12=6 op13=2 op14=3 op15=3 op16=7 op17=2 op18=8 op19=2 op20=25 op21=3/>
+; CHECK-NEXT: <PERMODULE_PROFILE {{.*}} op4=1 op5=3 op6=5 op7=1 op8=2 op9=3 op10=4 op11=1 op12=6 op13=2 op14=3 op15=3 op16=7 op17=2 op18=8 op19=2 op20=25 op21=3/>
; CHECK-NEXT: </GLOBALVAL_SUMMARY_BLOCK>
; CHECK: <STRTAB_BLOCK
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
new file mode 100644
index 000000000000..875f397646a6
--- /dev/null
+++ b/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
@@ -0,0 +1,121 @@
+; Test to check the callgraph in the summary when there is PGO.
+; RUN: opt -module-summary %s -o %t.o
+; RUN: llvm-bcanalyzer -dump %t.o | FileCheck %s
+; RUN: opt -module-summary %p/Inputs/thinlto-function-summary-callgraph-profile-summary.ll -o %t2.o
+; RUN: llvm-lto -thinlto -o %t3 %t.o %t2.o
+; RUN: llvm-bcanalyzer -dump %t3.thinlto.bc | FileCheck %s --check-prefix=COMBINED
+
+
+; CHECK: <SOURCE_FILENAME
+; "hot_function"
+; CHECK-NEXT: <FUNCTION op0=0 op1=12
+; "hot1"
+; CHECK-NEXT: <FUNCTION op0=12 op1=4
+; "hot2"
+; CHECK-NEXT: <FUNCTION op0=16 op1=4
+; "hot3"
+; CHECK-NEXT: <FUNCTION op0=20 op1=4
+; "hot4"
+; CHECK-NEXT: <FUNCTION op0=24 op1=4
+; "cold"
+; CHECK-NEXT: <FUNCTION op0=28 op1=4
+; "none1"
+; CHECK-NEXT: <FUNCTION op0=32 op1=5
+; "none2"
+; CHECK-NEXT: <FUNCTION op0=37 op1=5
+; "none3"
+; CHECK-NEXT: <FUNCTION op0=42 op1=5
+; CHECK-LABEL: <GLOBALVAL_SUMMARY_BLOCK
+; CHECK-NEXT: <VERSION
+; CHECK-NEXT: <VALUE_GUID op0=25 op1=123/>
+; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123
+; CHECK-NEXT: <PERMODULE_PROFILE {{.*}} op4=1 op5=3 op6=5 op7=1 op8=2 op9=3 op10=4 op11=3 op12=6 op13=2 op14=3 op15=3 op16=7 op17=2 op18=8 op19=2 op20=25 op21=3/>
+; CHECK-NEXT: </GLOBALVAL_SUMMARY_BLOCK>
+
+; CHECK: <STRTAB_BLOCK
+; CHECK-NEXT: blob data = 'hot_functionhot1hot2hot3hot4coldnone1none2none3'
+
+; COMBINED: <GLOBALVAL_SUMMARY_BLOCK
+; COMBINED-NEXT: <VERSION
+; COMBINED-NEXT: <VALUE_GUID
+; COMBINED-NEXT: <VALUE_GUID
+; COMBINED-NEXT: <VALUE_GUID
+; COMBINED-NEXT: <VALUE_GUID
+; COMBINED-NEXT: <VALUE_GUID
+; COMBINED-NEXT: <VALUE_GUID
+; COMBINED-NEXT: <VALUE_GUID
+; COMBINED-NEXT: <VALUE_GUID
+; COMBINED-NEXT: <COMBINED abbrevid=
+; COMBINED-NEXT: <COMBINED abbrevid=
+; COMBINED-NEXT: <COMBINED abbrevid=
+; COMBINED-NEXT: <COMBINED abbrevid=
+; COMBINED-NEXT: <COMBINED abbrevid=
+; COMBINED-NEXT: <COMBINED abbrevid=
+; COMBINED-NEXT: <COMBINED_PROFILE {{.*}} op5=[[HOT1:.*]] op6=3 op7=[[COLD:.*]] op8=1 op9=[[HOT2:.*]] op10=3 op11=[[NONE1:.*]] op12=2 op13=[[HOT3:.*]] op14=3 op15=[[NONE2:.*]] op16=2 op17=[[NONE3:.*]] op18=2/>
+; COMBINED-NEXT: <COMBINED abbrevid=
+; COMBINED-NEXT: </GLOBALVAL_SUMMARY_BLOCK>
+
+
+; ModuleID = 'thinlto-function-summary-callgraph.ll'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This function has a high profile count, so its entry block is hot.
+define void @hot_function(i1 %a, i1 %a2) !prof !20 {
+entry:
+ call void @hot1()
+ br i1 %a, label %Cold, label %Hot, !prof !41
+Cold: ; 1/1000 goes here
+ call void @cold()
+ call void @hot2()
+ call void @hot4(), !prof !15
+ call void @none1()
+ br label %exit
+Hot: ; 999/1000 goes here
+ call void @hot2()
+ call void @hot3()
+ br i1 %a2, label %None1, label %None2, !prof !42
+None1: ; half goes here
+ call void @none1()
+ call void @none2()
+ br label %exit
+None2: ; half goes here
+ call void @none3()
+ br label %exit
+exit:
+ ret void
+}
+
+declare void @hot1() #1
+declare void @hot2() #1
+declare void @hot3() #1
+declare void @hot4() #1
+declare void @cold() #1
+declare void @none1() #1
+declare void @none2() #1
+declare void @none3() #1
+
+
+!41 = !{!"branch_weights", i32 1, i32 1000}
+!42 = !{!"branch_weights", i32 1, i32 1}
+
+
+
+!llvm.module.flags = !{!1}
+!20 = !{!"function_entry_count", i64 110, i64 123}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"SampleProfile"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 10}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 999000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
+!15 = !{!"branch_weights", i32 100}
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir
index 739fdd5cb4c5..0f054f1d940c 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir
@@ -74,6 +74,21 @@
%res = bitcast <2 x i32> %vres to i64
ret i64 %res
}
+
+ define i64 @floatingPointLoad(i64 %arg1, double* %addr) {
+ %varg1 = bitcast i64 %arg1 to double
+ %varg2 = load double, double* %addr
+ %vres = fadd double %varg1, %varg2
+ %res = bitcast double %vres to i64
+ ret i64 %res
+ }
+
+ define void @floatingPointStore(i64 %arg1, double* %addr) {
+ %varg1 = bitcast i64 %arg1 to double
+ %vres = fadd double %varg1, %varg1
+ store double %vres, double* %addr
+ ret void
+ }
...
---
@@ -650,3 +665,84 @@ body: |
RET_ReallyLR implicit %x0
...
+
+---
+# Make sure we map what looks like floating point
+# loads to the floating point register bank.
+# CHECK-LABEL: name: floatingPointLoad
+name: floatingPointLoad
+legalized: true
+
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: fpr }
+# CHECK-NEXT: - { id: 3, class: fpr }
+# CHECK-NEXT: - { id: 4, class: fpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+
+# No repairing should be necessary in either mode.
+# CHECK: %0(s64) = COPY %x0
+# CHECK-NEXT: %1(p0) = COPY %x1
+# CHECK-NEXT: %2(s64) = G_LOAD %1(p0) :: (load 8 from %ir.addr)
+# %0 has been mapped to GPR, so we need to repair it to match FPR.
+# CHECK-NEXT: %4(s64) = COPY %0
+# CHECK-NEXT: %3(s64) = G_FADD %4, %2
+# CHECK-NEXT: %x0 = COPY %3(s64)
+# CHECK-NEXT: RET_ReallyLR implicit %x0
+
+body: |
+ bb.0:
+ liveins: %x0, %x1
+
+ %0(s64) = COPY %x0
+ %1(p0) = COPY %x1
+ %2(s64) = G_LOAD %1(p0) :: (load 8 from %ir.addr)
+ %3(s64) = G_FADD %0, %2
+ %x0 = COPY %3(s64)
+ RET_ReallyLR implicit %x0
+
+...
+
+---
+# Make sure we map what looks like floating point
+# stores to the floating point register bank.
+# CHECK-LABEL: name: floatingPointStore
+name: floatingPointStore
+legalized: true
+
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: fpr }
+# CHECK-NEXT: - { id: 3, class: fpr }
+# CHECK-NEXT: - { id: 4, class: fpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+
+# CHECK: %0(s64) = COPY %x0
+# CHECK-NEXT: %1(p0) = COPY %x1
+# %0 has been mapped to GPR, so we need to repair it to match FPR.
+# CHECK-NEXT: %3(s64) = COPY %0
+# CHECK-NEXT: %4(s64) = COPY %0
+# CHECK-NEXT: %2(s64) = G_FADD %3, %4
+# CHECK-NEXT: G_STORE %2(s64), %1(p0) :: (store 8 into %ir.addr)
+# CHECK-NEXT: RET_ReallyLR
+
+body: |
+ bb.0:
+ liveins: %x0, %x1
+
+ %0(s64) = COPY %x0
+ %1(p0) = COPY %x1
+ %2(s64) = G_FADD %0, %0
+ G_STORE %2(s64), %1(p0) :: (store 8 into %ir.addr)
+ RET_ReallyLR
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/call-translator.ll b/test/CodeGen/AArch64/GlobalISel/call-translator.ll
index f8d95c88cc8f..44705a9c9f65 100644
--- a/test/CodeGen/AArch64/GlobalISel/call-translator.ll
+++ b/test/CodeGen/AArch64/GlobalISel/call-translator.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=aarch64-linux-gnu -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
; CHECK-LABEL: name: test_trivial_call
-; CHECK: ADJCALLSTACKDOWN 0, implicit-def %sp, implicit %sp
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def %sp, implicit %sp
; CHECK: BL @trivial_callee, csr_aarch64_aapcs, implicit-def %lr
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def %sp, implicit %sp
declare void @trivial_callee()
@@ -186,7 +186,7 @@ define void @test_stack_slots([8 x i64], i64 %lhs, i64 %rhs, i64* %addr) {
; CHECK: [[C42:%[0-9]+]](s64) = G_CONSTANT i64 42
; CHECK: [[C12:%[0-9]+]](s64) = G_CONSTANT i64 12
; CHECK: [[PTR:%[0-9]+]](p0) = G_CONSTANT i64 0
-; CHECK: ADJCALLSTACKDOWN 24, implicit-def %sp, implicit %sp
+; CHECK: ADJCALLSTACKDOWN 24, 0, implicit-def %sp, implicit %sp
; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
; CHECK: [[C42_OFFS:%[0-9]+]](s64) = G_CONSTANT i64 0
; CHECK: [[C42_LOC:%[0-9]+]](p0) = G_GEP [[SP]], [[C42_OFFS]](s64)
diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll
index 2682fa7dcce1..fc1aeb7b37d9 100644
--- a/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -378,11 +378,11 @@ define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
; CHECK-NEXT: cmp x0, #13
; CHECK-NOT: ccmp
; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt
+; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]]
; CHECK-NEXT: cmp x2, #2
; CHECK-NEXT: cset [[REG2:w[0-9]+]], lt
; CHECK-NEXT: cmp x2, #4
; CHECK-NEXT: cset [[REG3:w[0-9]+]], gt
-; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]]
; CHECK-NEXT: and [[REG5:w[0-9]+]], [[REG2]], [[REG3]]
; CHECK-NEXT: orr [[REG6:w[0-9]+]], [[REG4]], [[REG5]]
; CHECK-NEXT: cmp [[REG6]], #0
diff --git a/test/CodeGen/AArch64/arm64-fml-combines.ll b/test/CodeGen/AArch64/arm64-fml-combines.ll
index 840d1dcbf060..f97498825279 100644
--- a/test/CodeGen/AArch64/arm64-fml-combines.ll
+++ b/test/CodeGen/AArch64/arm64-fml-combines.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -fp-contract=fast | FileCheck %s
+
define void @foo_2d(double* %src) {
entry:
%arrayidx1 = getelementptr inbounds double, double* %src, i64 5
@@ -126,3 +128,23 @@ for.body: ; preds = %for.body, %entry
for.end: ; preds = %for.body
ret void
}
+
+; CHECK-LABEL: test1:
+; CHECK: fnmadd s0, s0, s1, s2
+define float @test1(float %a, float %b, float %c) {
+entry:
+ %0 = fmul float %a, %b
+ %mul = fsub float -0.000000e+00, %0
+ %sub1 = fsub float %mul, %c
+ ret float %sub1
+}
+
+; CHECK-LABEL: test2:
+; CHECK: fnmadd d0, d0, d1, d2
+define double @test2(double %a, double %b, double %c) {
+entry:
+ %0 = fmul double %a, %b
+ %mul = fsub double -0.000000e+00, %0
+ %sub1 = fsub double %mul, %c
+ ret double %sub1
+}
diff --git a/test/CodeGen/AArch64/arm64-hello.ll b/test/CodeGen/AArch64/arm64-hello.ll
index caaf8615cd4a..a8d1c2482520 100644
--- a/test/CodeGen/AArch64/arm64-hello.ll
+++ b/test/CodeGen/AArch64/arm64-hello.ll
@@ -6,8 +6,8 @@
; CHECK-NEXT: stp x29, x30, [sp, #16]
; CHECK-NEXT: add x29, sp, #16
; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK: adrp x0, L_.str@PAGE
-; CHECK: add x0, x0, L_.str@PAGEOFF
+; CHECK: adrp x0, l_.str@PAGE
+; CHECK: add x0, x0, l_.str@PAGEOFF
; CHECK-NEXT: bl _puts
; CHECK-NEXT: ldp x29, x30, [sp, #16]
; CHECK-NEXT: add sp, sp, #32
diff --git a/test/CodeGen/AArch64/arm64-misched-multimmo.ll b/test/CodeGen/AArch64/arm64-misched-multimmo.ll
index 3593668e0156..4c0195b93a44 100644
--- a/test/CodeGen/AArch64/arm64-misched-multimmo.ll
+++ b/test/CodeGen/AArch64/arm64-misched-multimmo.ll
@@ -12,7 +12,7 @@
; CHECK: Successors:
; CHECK-NOT: ch SU(4)
; CHECK: SU(3)
-; CHECK: SU(4): STRWui %WZR, %X{{[0-9]+}}
+; CHECK: SU(5): STRWui %WZR, %X{{[0-9]+}}
define i32 @foo() {
entry:
%0 = load i32, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @G2, i64 0, i64 0), align 4
diff --git a/test/CodeGen/AArch64/macho-global-symbols.ll b/test/CodeGen/AArch64/macho-global-symbols.ll
new file mode 100644
index 000000000000..d68abad57ccd
--- /dev/null
+++ b/test/CodeGen/AArch64/macho-global-symbols.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=arm64-apple-ios %s -o - | FileCheck %s
+
+; All global symbols must be at most linker-private for AArch64 because we don't
+; use section-relative relocations in MachO.
+
+define i8* @private_sym() {
+; CHECK-LABEL: private_sym:
+; CHECK: adrp [[HIBITS:x[0-9]+]], l_var@PAGE
+; CHECK: add x0, [[HIBITS]], l_var@PAGEOFF
+
+ ret i8* getelementptr([2 x i8], [2 x i8]* @var, i32 0, i32 0)
+}
+
+; CHECK: .section __TEXT,__cstring
+; CHECK: l_var:
+; CHECK: .asciz "\002"
+@var = private unnamed_addr constant [2 x i8] [i8 2, i8 0]
diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll
index f29dfb3a9802..4c682e594e66 100644
--- a/test/CodeGen/AArch64/misched-fusion-aes.ll
+++ b/test/CodeGen/AArch64/misched-fusion-aes.ll
@@ -1,4 +1,5 @@
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA72
; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1
declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k)
@@ -87,6 +88,22 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
; CHECKA57: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
+; CHECKA72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
+; CHECKA72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
+; CHECKA72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
+; CHECKA72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
+; CHECKA72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
+; CHECKA72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
+; CHECKA72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
+; CHECKA72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1: aesmc {{v[0-7].16b}}, [[VA]]
; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
@@ -187,6 +204,22 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
; CHECKA57: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
+; CHECKA72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
+; CHECKA72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
+; CHECKA72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
+; CHECKA72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
+; CHECKA72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
+; CHECKA72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
+; CHECKA72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
+; CHECKA72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
; CHECKM1: aesimc {{v[0-7].16b}}, [[VA]]
; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
diff --git a/test/CodeGen/AArch64/stackmap-frame-setup.ll b/test/CodeGen/AArch64/stackmap-frame-setup.ll
index 5646703fa403..677ff8dc2530 100644
--- a/test/CodeGen/AArch64/stackmap-frame-setup.ll
+++ b/test/CodeGen/AArch64/stackmap-frame-setup.ll
@@ -7,11 +7,11 @@ entry:
store i64 11, i64* %metadata
store i64 12, i64* %metadata
store i64 13, i64* %metadata
-; ISEL: ADJCALLSTACKDOWN 0, implicit-def
+; ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def
; ISEL-NEXT: STACKMAP
; ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def
call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
-; FAST-ISEL: ADJCALLSTACKDOWN 0, implicit-def
+; FAST-ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def
; FAST-ISEL-NEXT: STACKMAP
; FAST-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def
ret void
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
index 56a9e7022db9..2a3d3887ed69 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -14,7 +14,7 @@ regBankSelected: true
# GCN: global_addrspace
# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1
-# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0, 0
+# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0
body: |
bb.0:
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
index ea435725bf25..89be3bde94a8 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -15,7 +15,7 @@ regBankSelected: true
# GCN: global_addrspace
# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1
# GCN: [[VAL:%[0-9]+]] = COPY %vgpr2
-# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0, 0
+# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0
body: |
bb.0:
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir
new file mode 100644
index 000000000000..8839ba8e0ab2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir
@@ -0,0 +1,20 @@
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+--- |
+ define void @test_constant() {
+ entry:
+ ret void
+ }
+...
+
+---
+name: test_constant
+registers:
+ - { id: 0, class: _ }
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: test_constant
+ ; CHECK: %0(s32) = G_CONSTANT i32 5
+
+ %0(s32) = G_CONSTANT i32 5
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg b/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg
new file mode 100644
index 000000000000..e99d1bb8446c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'global-isel' in config.root.available_features:
+ config.unsupported = True
diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index 62b47beb1251..bc992ed77ffd 100644
--- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -219,19 +219,19 @@ body: |
%34 = V_MOV_B32_e32 63, implicit %exec
%27 = V_AND_B32_e64 %26, %24, implicit %exec
- FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %27, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_AND_B32_e64 %24, %26, implicit %exec
- FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%29 = V_AND_B32_e32 %26, %24, implicit %exec
- FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %29, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%30 = V_AND_B32_e64 %26, %26, implicit %exec
- FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %30, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%31 = V_AND_B32_e64 %34, %34, implicit %exec
- FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %37, %31, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
@@ -407,34 +407,34 @@ body: |
%27 = S_MOV_B32 -4
%11 = V_LSHLREV_B32_e64 12, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%13 = V_LSHL_B32_e64 %7, 12, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%14 = V_LSHL_B32_e64 12, %7, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%15 = V_LSHL_B32_e64 12, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%22 = V_LSHL_B32_e64 %6, 12, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%23 = V_LSHL_B32_e64 %6, 32, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%25 = V_LSHL_B32_e32 %6, %6, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%26 = V_LSHLREV_B32_e32 11, %24, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_LSHL_B32_e32 %27, %6, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
@@ -615,34 +615,34 @@ body: |
%35 = V_MOV_B32_e32 2, implicit %exec
%11 = V_ASHRREV_I32_e64 8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%13 = V_ASHR_I32_e64 %7, 3, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%14 = V_ASHR_I32_e64 7, %32, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%15 = V_ASHR_I32_e64 %27, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%22 = V_ASHR_I32_e64 %6, 4, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%23 = V_ASHR_I32_e64 %6, %33, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%25 = V_ASHR_I32_e32 %34, %34, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%26 = V_ASHRREV_I32_e32 11, %10, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_ASHR_I32_e32 %27, %35, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
@@ -824,34 +824,34 @@ body: |
%35 = V_MOV_B32_e32 2, implicit %exec
%11 = V_LSHRREV_B32_e64 8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec
- FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%13 = V_LSHR_B32_e64 %7, 3, implicit %exec
- FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%14 = V_LSHR_B32_e64 7, %32, implicit %exec
- FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%15 = V_LSHR_B32_e64 %27, %24, implicit %exec
- FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%22 = V_LSHR_B32_e64 %6, 4, implicit %exec
- FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%23 = V_LSHR_B32_e64 %6, %33, implicit %exec
- FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%25 = V_LSHR_B32_e32 %34, %34, implicit %exec
- FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%26 = V_LSHRREV_B32_e32 11, %10, implicit %exec
- FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_LSHR_B32_e32 %27, %35, implicit %exec
- FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+ FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
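
A note on the mechanical churn above: every FLAT store in these MIR tests loses one trailing immediate operand (the 0, 0, 0 run becomes 0, 0). The diff only shows the operand count shrinking; by all appearances the dropped operand is an unused trailing flag (tfe) removed from the FLAT instruction definitions, so the operand names below are a presumed annotation, not something the tests spell out:

  FLAT_STORE_DWORD vaddr, vdata, glc, slc, tfe, implicit %exec, implicit %flat_scr   ; old
  FLAT_STORE_DWORD vaddr, vdata, glc, slc, implicit %exec, implicit %flat_scr        ; new

The FLAT loads in the waitcnt.mir and limit-coalesce.mir hunks further down get the same one-operand trim.
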
diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 0831d250b9e7..8611cd080e15 100644
--- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -25,7 +25,7 @@ define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
}
; GCN-LABEL: {{^}}fold_mi_v_or_0:
-; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
+; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
@@ -50,7 +50,7 @@ define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
}
; GCN-LABEL: {{^}}fold_mi_v_xor_0:
-; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
+; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
@@ -86,8 +86,8 @@ define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
}
; GCN-LABEL: {{^}}fold_mi_v_not_0:
-; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
-; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
+; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]]
; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}}
; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
@@ -104,8 +104,8 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
; GCN: buffer_load_dwordx2
; GCN: buffer_load_dwordx2 v{{\[}}[[VREG1_LO:[0-9]+]]:[[VREG1_HI:[0-9]+]]{{\]}}
-; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
-; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
+; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]]
; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
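
The relaxed mnemonic checks in this file, and in the ctpop, cvt_pkrtz and mbcnt tests below, all use the same FileCheck device: {{...}} embeds a regular expression inside a match pattern, and (_e64)* accepts zero or more copies of the suffix, so a single line now matches both the suffix-less print and the older _e64 form. A minimal sketch of the idiom:

  ; Matches "v_bcnt_u32_b32 v0, v1, 0" as well as "v_bcnt_u32_b32_e64 v0, v1, 0":
  ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0

(_e64)? would express "optional" more precisely, but against real disassembly the two patterns match the same strings.
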
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
index a29e72ea57cb..aa913ad406d2 100644
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -25,7 +25,7 @@ define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val)
; XXX - Why 0 in register?
; FUNC-LABEL: {{^}}v_ctpop_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0
+; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
@@ -40,9 +40,9 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
; GCN: buffer_load_dword [[VAL1:v[0-9]+]],
; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
-; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
+; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
-; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
@@ -61,7 +61,7 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out,
; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
; GCN: s_waitcnt
-; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
+; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
@@ -73,8 +73,8 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out,
}
; FUNC-LABEL: {{^}}v_ctpop_v2i32:
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm
; EG: BCNT_INT
@@ -87,10 +87,10 @@ define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <
}
; FUNC-LABEL: {{^}}v_ctpop_v4i32:
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm
; EG: BCNT_INT
@@ -105,14 +105,14 @@ define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <
}
; FUNC-LABEL: {{^}}v_ctpop_v8i32:
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm
; EG: BCNT_INT
@@ -131,22 +131,22 @@ define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <
}
; FUNC-LABEL: {{^}}v_ctpop_v16i32:
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
-; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
+; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm
; EG: BCNT_INT
@@ -174,7 +174,7 @@ define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out,
; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
@@ -189,7 +189,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noa
; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
@@ -206,7 +206,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)*
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
-; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
+; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
@@ -220,7 +220,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou
; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
-; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
@@ -236,7 +236,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i
; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
-; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
@@ -253,7 +253,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}}
; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
index 2610684ad9ee..f18bd9fd8174 100644
--- a/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -26,9 +26,9 @@ define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val)
; FUNC-LABEL: {{^}}v_ctpop_i64:
; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
-; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
+; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
-; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
@@ -41,9 +41,9 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs
; FUNC-LABEL: {{^}}v_ctpop_i64_user:
; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
-; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
+; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
-; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]]
; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
@@ -171,11 +171,11 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val)
; FUNC-LABEL: {{^}}v_ctpop_i128:
; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0
-; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]]
+; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0
+; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]]
-; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0
-; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]]
+; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0
+; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]]
; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT1]], [[MIDRESULT2]]
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
index 1c0e9a2f13ce..66bf9d0ffb00 100644
--- a/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -1471,11 +1471,10 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addr
; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
-; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
-; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
-; GCN: buffer_store_dword [[MUL]]
+; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0
+; GCN-NEXT: buffer_store_dword [[ADD]]
+; GCN-NEXT: buffer_store_dword [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
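
For readers skimming the rewritten checks above: the old sequence materialized the negation as an explicit v_xor_b32 with 0x80000000, the f32 sign bit; the new one folds it into VOP3 source modifiers, which is what a leading minus on an operand denotes in these assembly checks. A sketch of the line being matched, with a hypothetical concrete instance:

  ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
  ; e.g. matches: v_mul_legacy_f32_e64 v2, v0, -v1
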
diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll
index 626a0b50cce8..ed36666db807 100644
--- a/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
; FIXME: Should be able to do scalar op
; GCN-LABEL: {{^}}s_fneg_f16:
@@ -129,6 +129,41 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
ret void
}
+; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
+; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+
+; GFX89: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
+; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
+; GFX89-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
+define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
+ %elt0 = extractelement <2 x half> %fneg, i32 0
+ %elt1 = extractelement <2 x half> %fneg, i32 1
+
+ %fmul0 = fmul half %elt0, 4.0
+ %fadd1 = fadd half %elt1, 2.0
+ store volatile half %fmul0, half addrspace(1)* undef
+ store volatile half %fadd1, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]]
+; GCN: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]]
+define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
+ %elt0 = extractelement <2 x half> %fneg, i32 0
+ %elt1 = extractelement <2 x half> %fneg, i32 1
+ store volatile half %elt0, half addrspace(1)* undef
+ store volatile half %elt1, half addrspace(1)* undef
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
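
The two new tests just added form a deliberate pair: in v_extract_fneg_fold_v2f16 the extracted halves feed arithmetic, so the negation folds into the users (the -4.0 multiply and the 2.0 minus x subtract); in v_extract_fneg_no_fold_v2f16 the halves are only stored, so the negation stays explicit, and since both halves share one dword it is a single xor flipping both sign bits at once:

  ; 0x80008000 sets the sign bit of each half of a packed <2 x half>:
  ; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]]
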
diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir
index c6fe6debd225..ff9fcd1c693f 100644
--- a/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -246,15 +246,15 @@ body: |
S_BRANCH %bb.1
bb.1:
- FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
S_ENDPGM
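
The FLAT atomics in this hunk shed one immediate as well (0, 0 down to a single 0), consistent with one trailing flag being dropped across the whole FLAT family; the non-returning atomics simply carried fewer flag operands to begin with. Again with presumed, annotated operand names:

  FLAT_ATOMIC_CMPSWAP_X2 vaddr, vdata, slc, tfe, implicit %exec, implicit %flat_scr   ; old
  FLAT_ATOMIC_CMPSWAP_X2 vaddr, vdata, slc, implicit %exec, implicit %flat_scr        ; new
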
diff --git a/test/CodeGen/AMDGPU/limit-coalesce.mir b/test/CodeGen/AMDGPU/limit-coalesce.mir
index 106a96e32dc3..a0d2d6c097a2 100644
--- a/test/CodeGen/AMDGPU/limit-coalesce.mir
+++ b/test/CodeGen/AMDGPU/limit-coalesce.mir
@@ -57,15 +57,15 @@ body: |
%4.sub1 = COPY %3.sub0
undef %5.sub0 = COPY %4.sub1
%5.sub1 = COPY %4.sub0
- FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, implicit %exec, implicit %flat_scr
%6 = IMPLICIT_DEF
undef %7.sub0_sub1 = COPY %6
%7.sub2 = COPY %3.sub0
- FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, implicit %exec, implicit %flat_scr
%8 = IMPLICIT_DEF
undef %9.sub0_sub1_sub2 = COPY %8
%9.sub3 = COPY %3.sub0
- FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, implicit %exec, implicit %flat_scr
...
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index b92eb34750d9..7179d02fc6dd 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -7,7 +7,7 @@
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
-; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]]
+; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[X]], [[VY]]
define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
store <2 x half> %result, <2 x half> addrspace(1)* %out
@@ -16,7 +16,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32:
; GCN: s_load_dword [[X:s[0-9]+]]
-; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]]
+; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
store <2 x half> %result, <2 x half> addrspace(1)* %out
@@ -39,7 +39,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
-; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]]
+; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -55,7 +55,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0
+; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -70,7 +70,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_imm_reg:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
-; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]]
+; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, 1.0, [[A]]
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -85,7 +85,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]]
+; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -103,7 +103,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_hi:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]]
+; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -121,7 +121,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]]
+; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -140,7 +140,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]]
+; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
index ab76c870796b..144c8f428ab0 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
@@ -2,9 +2,9 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}mbcnt_intrinsics:
-; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0
+; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[LO:v[0-9]+]], -1, 0
; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
-; VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]]
+; VI: v_mbcnt_hi_u32_b32 {{v[0-9]+}}, -1, [[LO]]
define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) {
main_body:
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index eb4066a2a0a8..5f1fb0e2d732 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
; GCN-LABEL: {{^}}madak_f32:
; GCN: buffer_load_dword [[VA:v[0-9]+]]
; GCN: buffer_load_dword [[VB:v[0-9]+]]
-; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
+; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -63,7 +63,7 @@ define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, flo
; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
; GCN: buffer_load_dword [[VA:v[0-9]+]]
-; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
+; GCN: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -198,7 +198,7 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia
; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
; GCN: buffer_load_dword [[VGPR:v[0-9]+]]
-; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
+; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
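
The madak checks drop an _e32 suffix rather than _e64, but for the same printing-change reason. What the line matches is worth keeping in mind: v_madak computes d = s0 * s1 + K, where K is a 32-bit literal carried in the instruction itself, so the final operand in each check is that constant:

  ; 0x41200000 is 10.0f, encoded as the trailing literal K:
  ; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
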
diff --git a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
index 9c43a6dc60f4..d7655993a2d9 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
@@ -1,26 +1,26 @@
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
; CHECK-LABEL: @volatile_load(
-; CHECK: alloca [5 x i32]
+; CHECK: alloca [4 x i32]
; CHECK: load volatile i32, i32*
define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
- %stack = alloca [5 x i32], align 4
+ %stack = alloca [4 x i32], align 4
%tmp = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32]* %stack, i32 0, i32 %tmp
%load = load volatile i32, i32* %arrayidx1
store i32 %load, i32 addrspace(1)* %out
ret void
}
; CHECK-LABEL: @volatile_store(
-; CHECK: alloca [5 x i32]
+; CHECK: alloca [4 x i32]
; CHECK: store volatile i32 %tmp, i32*
define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
- %stack = alloca [5 x i32], align 4
+ %stack = alloca [4 x i32], align 4
%tmp = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32]* %stack, i32 0, i32 %tmp
store volatile i32 %tmp, i32* %arrayidx1
ret void
}
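
The diff does not say why the array shrinks from [5 x i32] to [4 x i32]; a plausible reading is that the smaller alloca keeps the test inside the pass's promotion budget, so the volatile access stays the one and only reason promotion is refused, which is the property under test:

  ; CHECK-LABEL: @volatile_load(
  ; CHECK: alloca [4 x i32]
  ; CHECK: load volatile i32, i32*
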
diff --git a/test/CodeGen/AMDGPU/v_madak_f16.ll b/test/CodeGen/AMDGPU/v_madak_f16.ll
index bfb10503aaea..0148ff470b78 100644
--- a/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -4,7 +4,7 @@
; GCN-LABEL: {{^}}madak_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; VI: v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}}
+; VI: v_madak_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}}
; VI: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @madak_f16(
diff --git a/test/CodeGen/AMDGPU/waitcnt.mir b/test/CodeGen/AMDGPU/waitcnt.mir
index 38662e83b359..f754415dccb4 100644
--- a/test/CodeGen/AMDGPU/waitcnt.mir
+++ b/test/CodeGen/AMDGPU/waitcnt.mir
@@ -51,21 +51,21 @@ name: flat_zero_waitcnt
body: |
bb.0:
successors: %bb.1
- %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.global4)
- %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16)
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.global4)
+ %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16)
%vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec
S_BRANCH %bb.1
bb.1:
successors: %bb.2
- %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
- %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16)
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16)
%vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec
S_BRANCH %bb.2
bb.2:
- %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.flat4)
- %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.flat16)
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.flat4)
+ %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.flat16)
%vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec
S_ENDPGM
...
@@ -86,11 +86,11 @@ name: single_fallthrough_successor_no_end_block_wait
body: |
bb.0:
successors: %bb.1
- %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr
bb.1:
%vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
- FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
...
---
@@ -114,15 +114,15 @@ name: single_branch_successor_not_next_block
body: |
bb.0:
successors: %bb.2
- %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
+ %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr
S_BRANCH %bb.2
bb.1:
- FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
bb.2:
%vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
- FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, implicit %exec, implicit %flat_scr
S_ENDPGM
...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
index 83ab2659ef4a..72c3b715d36e 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
@@ -4,6 +4,8 @@
define void @test_sext_s1() { ret void }
define void @test_sext_s8() { ret void }
define void @test_zext_s16() { ret void }
+ define void @test_anyext_s8() { ret void }
+ define void @test_anyext_s16() { ret void }
define void @test_trunc_s32_16() { ret void }
@@ -149,6 +151,58 @@ body: |
; CHECK: BX_RET 14, _, implicit %r0
...
---
+name: test_anyext_s8
+# CHECK-LABEL: name: test_anyext_s8
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+ - { id: 1, class: gprb }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(s8) = COPY %r0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0
+
+ %1(s32) = G_ANYEXT %0(s8)
+ ; CHECK: [[VREGEXT:%[0-9]+]] = COPY [[VREGX]]
+
+ %r0 = COPY %1(s32)
+ ; CHECK: %r0 = COPY [[VREGEXT]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
+name: test_anyext_s16
+# CHECK-LABEL: name: test_anyext_s16
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+ - { id: 0, class: gprb }
+ - { id: 1, class: gprb }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(s16) = COPY %r0
+ ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0
+
+ %1(s32) = G_ANYEXT %0(s16)
+ ; CHECK: [[VREGEXT:%[0-9]+]] = COPY [[VREGX]]
+
+ %r0 = COPY %1(s32)
+ ; CHECK: %r0 = COPY [[VREGEXT]]
+
+ BX_RET 14, _, implicit %r0
+ ; CHECK: BX_RET 14, _, implicit %r0
+...
+---
name: test_trunc_s32_16
# CHECK-LABEL: name: test_trunc_s32_16
legalized: true
@@ -187,9 +241,15 @@ registers:
- { id: 0, class: gprb }
- { id: 1, class: gprb }
- { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+ - { id: 4, class: gprb }
+ - { id: 5, class: gprb }
# CHECK-DAG: id: 0, class: gpr
# CHECK-DAG: id: 1, class: gpr
# CHECK-DAG: id: 2, class: gpr
+# CHECK-DAG: id: 3, class: gpr
+# CHECK-DAG: id: 4, class: gpr
+# CHECK-DAG: id: 5, class: gpr
body: |
bb.0:
liveins: %r0, %r1
@@ -200,11 +260,20 @@ body: |
%1(s8) = COPY %r1
; CHECK: [[VREGY:%[0-9]+]] = COPY %r1
- %2(s8) = G_ADD %0, %1
- ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGX]], [[VREGY]], 14, _, _
+ %2(s32) = G_ANYEXT %0(s8)
+ ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]]
- %r0 = COPY %2(s8)
- ; CHECK: %r0 = COPY [[VREGSUM]]
+ %3(s32) = G_ANYEXT %1(s8)
+ ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]]
+
+ %4(s32) = G_ADD %2, %3
+ ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _
+
+ %5(s8) = G_TRUNC %4(s32)
+ ; CHECK: [[VREGSUMTR:%[0-9]+]] = COPY [[VREGSUM]]
+
+ %r0 = COPY %5(s8)
+ ; CHECK: %r0 = COPY [[VREGSUMTR]]
BX_RET 14, _, implicit %r0
; CHECK: BX_RET 14, _, implicit %r0
@@ -220,9 +289,15 @@ registers:
- { id: 0, class: gprb }
- { id: 1, class: gprb }
- { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+ - { id: 4, class: gprb }
+ - { id: 5, class: gprb }
# CHECK-DAG: id: 0, class: gpr
# CHECK-DAG: id: 1, class: gpr
# CHECK-DAG: id: 2, class: gpr
+# CHECK-DAG: id: 3, class: gpr
+# CHECK-DAG: id: 4, class: gpr
+# CHECK-DAG: id: 5, class: gpr
body: |
bb.0:
liveins: %r0, %r1
@@ -233,11 +308,20 @@ body: |
%1(s16) = COPY %r1
; CHECK: [[VREGY:%[0-9]+]] = COPY %r1
- %2(s16) = G_ADD %0, %1
- ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGX]], [[VREGY]], 14, _, _
+ %2(s32) = G_ANYEXT %0(s16)
+ ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]]
- %r0 = COPY %2(s16)
- ; CHECK: %r0 = COPY [[VREGSUM]]
+ %3(s32) = G_ANYEXT %1(s16)
+ ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]]
+
+ %4(s32) = G_ADD %2, %3
+ ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _
+
+ %5(s16) = G_TRUNC %4(s32)
+ ; CHECK: [[VREGSUMTR:%[0-9]+]] = COPY [[VREGSUM]]
+
+ %r0 = COPY %5(s16)
+ ; CHECK: %r0 = COPY [[VREGSUMTR]]
BX_RET 14, _, implicit %r0
; CHECK: BX_RET 14, _, implicit %r0
@@ -352,9 +436,15 @@ registers:
- { id: 0, class: gprb }
- { id: 1, class: gprb }
- { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+ - { id: 4, class: gprb }
+ - { id: 5, class: gprb }
# CHECK-DAG: id: 0, class: gpr
# CHECK-DAG: id: 1, class: gpr
# CHECK-DAG: id: 2, class: gpr
+# CHECK-DAG: id: 3, class: gpr
+# CHECK-DAG: id: 4, class: gpr
+# CHECK-DAG: id: 5, class: gpr
body: |
bb.0:
liveins: %r0, %r1
@@ -365,11 +455,20 @@ body: |
%1(s8) = COPY %r1
; CHECK: [[VREGY:%[0-9]+]] = COPY %r1
- %2(s8) = G_SUB %0, %1
- ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGX]], [[VREGY]], 14, _, _
+ %2(s32) = G_ANYEXT %0(s8)
+ ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]]
- %r0 = COPY %2(s8)
- ; CHECK: %r0 = COPY [[VREGRES]]
+ %3(s32) = G_ANYEXT %1(s8)
+ ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]]
+
+ %4(s32) = G_SUB %2, %3
+ ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _
+
+ %5(s8) = G_TRUNC %4(s32)
+ ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]]
+
+ %r0 = COPY %5(s8)
+ ; CHECK: %r0 = COPY [[VREGRESTR]]
BX_RET 14, _, implicit %r0
; CHECK: BX_RET 14, _, implicit %r0
@@ -385,9 +484,15 @@ registers:
- { id: 0, class: gprb }
- { id: 1, class: gprb }
- { id: 2, class: gprb }
+ - { id: 3, class: gprb }
+ - { id: 4, class: gprb }
+ - { id: 5, class: gprb }
# CHECK-DAG: id: 0, class: gpr
# CHECK-DAG: id: 1, class: gpr
# CHECK-DAG: id: 2, class: gpr
+# CHECK-DAG: id: 3, class: gpr
+# CHECK-DAG: id: 4, class: gpr
+# CHECK-DAG: id: 5, class: gpr
body: |
bb.0:
liveins: %r0, %r1
@@ -398,11 +503,20 @@ body: |
%1(s16) = COPY %r1
; CHECK: [[VREGY:%[0-9]+]] = COPY %r1
- %2(s16) = G_SUB %0, %1
- ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGX]], [[VREGY]], 14, _, _
+ %2(s32) = G_ANYEXT %0(s16)
+ ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]]
- %r0 = COPY %2(s16)
- ; CHECK: %r0 = COPY [[VREGRES]]
+ %3(s32) = G_ANYEXT %1(s16)
+ ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]]
+
+ %4(s32) = G_SUB %2, %3
+ ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _
+
+ %5(s16) = G_TRUNC %4(s32)
+ ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]]
+
+ %r0 = COPY %5(s16)
+ ; CHECK: %r0 = COPY [[VREGRESTR]]
BX_RET 14, _, implicit %r0
; CHECK: BX_RET 14, _, implicit %r0
@@ -451,9 +565,15 @@ registers:
- { id: 0, class: gprb }
- { id: 1, class: gprb }
- { id: 2, class: gprb }
-# CHECK-DAG: id: 0, class: gprnopc
-# CHECK-DAG: id: 1, class: gprnopc
+ - { id: 3, class: gprb }
+ - { id: 4, class: gprb }
+ - { id: 5, class: gprb }
+# CHECK-DAG: id: 0, class: gpr
+# CHECK-DAG: id: 1, class: gpr
# CHECK-DAG: id: 2, class: gprnopc
+# CHECK-DAG: id: 3, class: gprnopc
+# CHECK-DAG: id: 4, class: gprnopc
+# CHECK-DAG: id: 5, class: gpr
body: |
bb.0:
liveins: %r0, %r1
@@ -464,11 +584,20 @@ body: |
%1(s8) = COPY %r1
; CHECK: [[VREGY:%[0-9]+]] = COPY %r1
- %2(s8) = G_MUL %0, %1
- ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGX]], [[VREGY]], 14, _, _
+ %2(s32) = G_ANYEXT %0(s8)
+ ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]]
- %r0 = COPY %2(s8)
- ; CHECK: %r0 = COPY [[VREGRES]]
+ %3(s32) = G_ANYEXT %1(s8)
+ ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]]
+
+ %4(s32) = G_MUL %2, %3
+ ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGXEXT]], [[VREGYEXT]], 14, _, _
+
+ %5(s8) = G_TRUNC %4(s32)
+ ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]]
+
+ %r0 = COPY %5(s8)
+ ; CHECK: %r0 = COPY [[VREGRESTR]]
BX_RET 14, _, implicit %r0
; CHECK: BX_RET 14, _, implicit %r0
@@ -484,9 +613,15 @@ registers:
- { id: 0, class: gprb }
- { id: 1, class: gprb }
- { id: 2, class: gprb }
-# CHECK-DAG: id: 0, class: gprnopc
-# CHECK-DAG: id: 1, class: gprnopc
+ - { id: 3, class: gprb }
+ - { id: 4, class: gprb }
+ - { id: 5, class: gprb }
+# CHECK-DAG: id: 0, class: gpr
+# CHECK-DAG: id: 1, class: gpr
# CHECK-DAG: id: 2, class: gprnopc
+# CHECK-DAG: id: 3, class: gprnopc
+# CHECK-DAG: id: 4, class: gprnopc
+# CHECK-DAG: id: 5, class: gpr
body: |
bb.0:
liveins: %r0, %r1
@@ -497,11 +632,20 @@ body: |
%1(s16) = COPY %r1
; CHECK: [[VREGY:%[0-9]+]] = COPY %r1
- %2(s16) = G_MUL %0, %1
- ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGX]], [[VREGY]], 14, _, _
+ %2(s32) = G_ANYEXT %0(s16)
+ ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]]
- %r0 = COPY %2(s16)
- ; CHECK: %r0 = COPY [[VREGRES]]
+ %3(s32) = G_ANYEXT %1(s16)
+ ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]]
+
+ %4(s32) = G_MUL %2, %3
+ ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGXEXT]], [[VREGYEXT]], 14, _, _
+
+ %5(s16) = G_TRUNC %4(s32)
+ ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]]
+
+ %r0 = COPY %5(s16)
+ ; CHECK: %r0 = COPY [[VREGRESTR]]
BX_RET 14, _, implicit %r0
; CHECK: BX_RET 14, _, implicit %r0
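
The rewrites in this file all follow one template: narrow G_ADD, G_SUB and G_MUL no longer reach instruction selection intact; the legalizer has widened them to s32, and at selection time the extensions and the truncation degenerate into plain COPYs, since the underlying ARM registers are 32-bit anyway. The widened shape the tests now expect, lifted from the updated bodies:

  %2(s32) = G_ANYEXT %0(s8)
  %3(s32) = G_ANYEXT %1(s8)
  %4(s32) = G_ADD %2, %3
  %5(s8) = G_TRUNC %4(s32)
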
diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
index 44fe7410b42c..53577dbd76f6 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
@@ -421,7 +421,7 @@ entry:
define arm_aapcscc void @test_indirect_call(void() *%fptr) {
; CHECK-LABEL: name: test_indirect_call
; CHECK: [[FPTR:%[0-9]+]](p0) = COPY %r0
-; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp
; CHECK: BLX [[FPTR]](p0), csr_aapcs, implicit-def %lr, implicit %sp
; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp
entry:
@@ -433,7 +433,7 @@ declare arm_aapcscc void @call_target()
define arm_aapcscc void @test_direct_call() {
; CHECK-LABEL: name: test_direct_call
-; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp
; CHECK: BLX @call_target, csr_aapcs, implicit-def %lr, implicit %sp
; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp
entry:
@@ -447,7 +447,7 @@ define arm_aapcscc i32* @test_call_simple_reg_params(i32 *%a, i32 %b) {
; CHECK-LABEL: name: test_call_simple_reg_params
; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0
; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1
-; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp
; CHECK-DAG: %r0 = COPY [[BVREG]]
; CHECK-DAG: %r1 = COPY [[AVREG]]
; CHECK: BLX @simple_reg_params_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit-def %r0
@@ -466,7 +466,7 @@ define arm_aapcscc i32* @test_call_simple_stack_params(i32 *%a, i32 %b) {
; CHECK-LABEL: name: test_call_simple_stack_params
; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0
; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1
-; CHECK: ADJCALLSTACKDOWN 8, 14, _, implicit-def %sp, implicit %sp
+; CHECK: ADJCALLSTACKDOWN 8, 0, 14, _, implicit-def %sp, implicit %sp
; CHECK-DAG: %r0 = COPY [[BVREG]]
; CHECK-DAG: %r1 = COPY [[AVREG]]
; CHECK-DAG: %r2 = COPY [[BVREG]]
@@ -496,7 +496,7 @@ define arm_aapcscc signext i16 @test_call_ext_params(i8 %a, i16 %b, i1 %c) {
; CHECK-DAG: [[AVREG:%[0-9]+]](s8) = COPY %r0
; CHECK-DAG: [[BVREG:%[0-9]+]](s16) = COPY %r1
; CHECK-DAG: [[CVREG:%[0-9]+]](s1) = COPY %r2
-; CHECK: ADJCALLSTACKDOWN 20, 14, _, implicit-def %sp, implicit %sp
+; CHECK: ADJCALLSTACKDOWN 20, 0, 14, _, implicit-def %sp, implicit %sp
; CHECK: [[SEXTA:%[0-9]+]](s32) = G_SEXT [[AVREG]](s8)
; CHECK: %r0 = COPY [[SEXTA]]
; CHECK: [[ZEXTA:%[0-9]+]](s32) = G_ZEXT [[AVREG]](s8)
@@ -547,7 +547,7 @@ define arm_aapcs_vfpcc double @test_call_vfpcc_fp_params(double %a, float %b) {
; CHECK-LABEL: name: test_call_vfpcc_fp_params
; CHECK-DAG: [[AVREG:%[0-9]+]](s64) = COPY %d0
; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %s2
-; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp
; CHECK-DAG: %s0 = COPY [[BVREG]]
; CHECK-DAG: %d1 = COPY [[AVREG]]
; CHECK: BLX @vfpcc_fp_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %s0, implicit %d1, implicit-def %d0
@@ -569,7 +569,7 @@ define arm_aapcscc double @test_call_aapcs_fp_params(double %a, float %b) {
; LITTLE-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A1]](s32), 0, [[A2]](s32), 32
; BIG-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A2]](s32), 0, [[A1]](s32), 32
; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r2
-; CHECK: ADJCALLSTACKDOWN 16, 14, _, implicit-def %sp, implicit %sp
+; CHECK: ADJCALLSTACKDOWN 16, 0, 14, _, implicit-def %sp, implicit %sp
; CHECK-DAG: %r0 = COPY [[BVREG]]
; CHECK-DAG: [[A1:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 0
; CHECK-DAG: [[A2:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 32
@@ -608,7 +608,7 @@ declare arm_aapcscc float @different_call_conv_target(float)
define arm_aapcs_vfpcc float @test_call_different_call_conv(float %x) {
; CHECK-LABEL: name: test_call_different_call_conv
; CHECK: [[X:%[0-9]+]](s32) = COPY %s0
-; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp
; CHECK: %r0 = COPY [[X]]
; CHECK: BLX @different_call_conv_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit-def %r0
; CHECK: [[R:%[0-9]+]](s32) = COPY %r0
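
Every call test in this file gains one operand on ADJCALLSTACKDOWN: the instruction now carries two immediates ahead of the predicate, and the new one is zero in all of the updated checks. The diff itself does not reveal what the second immediate encodes, so read the sketch as: first immediate, call-frame size in bytes as before; second, the newly threaded-through value, meaning unspecified here:

  ADJCALLSTACKDOWN 8, 14, _, implicit-def %sp, implicit %sp      ; old
  ADJCALLSTACKDOWN 8, 0, 14, _, implicit-def %sp, implicit %sp   ; new
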
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
index 625d35acf17b..f6ac92597cb2 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
@@ -91,8 +91,9 @@ body: |
%0(s8) = COPY %r0
%1(s8) = COPY %r1
%2(s8) = G_ADD %0, %1
- ; G_ADD with s8 is legal, so we should find it unchanged in the output
- ; CHECK: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}}
+ ; G_ADD with s8 should widen
+ ; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
BX_RET 14, _, implicit %r0
...
@@ -115,8 +116,9 @@ body: |
%0(s16) = COPY %r0
%1(s16) = COPY %r1
%2(s16) = G_ADD %0, %1
- ; G_ADD with s16 is legal, so we should find it unchanged in the output
- ; CHECK: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}}
+ ; G_ADD with s16 should widen
+ ; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
BX_RET 14, _, implicit %r0
@@ -165,8 +167,9 @@ body: |
%0(s8) = COPY %r0
%1(s8) = COPY %r1
%2(s8) = G_SUB %0, %1
- ; G_SUB with s8 is legal, so we should find it unchanged in the output
- ; CHECK: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}}
+ ; G_SUB with s8 should widen
+ ; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
BX_RET 14, _, implicit %r0
...
@@ -189,8 +192,9 @@ body: |
%0(s16) = COPY %r0
%1(s16) = COPY %r1
%2(s16) = G_SUB %0, %1
- ; G_SUB with s16 is legal, so we should find it unchanged in the output
- ; CHECK: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}}
+ ; G_SUB with s16 should widen
+ ; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}}
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
BX_RET 14, _, implicit %r0
@@ -239,8 +243,9 @@ body: |
%0(s8) = COPY %r0
%1(s8) = COPY %r1
%2(s8) = G_MUL %0, %1
- ; G_MUL with s8 is legal, so we should find it unchanged in the output
- ; CHECK: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}}
+ ; G_MUL with s8 should widen
+ ; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
+ ; CHECK-NOT: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s8)
BX_RET 14, _, implicit %r0
...
@@ -263,8 +268,9 @@ body: |
%0(s16) = COPY %r0
%1(s16) = COPY %r1
%2(s16) = G_MUL %0, %1
- ; G_MUL with s16 is legal, so we should find it unchanged in the output
- ; CHECK: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}}
+ ; G_MUL with s16 should widen
+ ; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}}
+ ; CHECK-NOT: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}}
%r0 = COPY %2(s16)
BX_RET 14, _, implicit %r0
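
One FileCheck subtlety in the strengthened assertions above: a CHECK-NOT line constrains the region between the preceding match and the next CHECK match (or the end of input if none follows), so placing it after the s32 check asserts that no narrow operation survives anywhere past the widened one. Paired as in the tests:

  ; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}}
  ; CHECK-NOT: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}}
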
diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
index 4e94fb4e3481..dfccc47c277c 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
@@ -25,6 +25,9 @@
define void @test_constants() { ret void }
+ define void @test_anyext_s8_32() { ret void }
+ define void @test_anyext_s16_32() { ret void }
+
define void @test_trunc_s32_16() { ret void }
define void @test_fadd_s32() #0 { ret void }
@@ -71,19 +74,28 @@ selected: false
# CHECK: - { id: 0, class: gprb }
# CHECK: - { id: 1, class: gprb }
# CHECK: - { id: 2, class: gprb }
+# CHECK: - { id: 3, class: gprb }
+# CHECK: - { id: 4, class: gprb }
+# CHECK: - { id: 5, class: gprb }
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
body: |
bb.0:
liveins: %r0, %r1
%0(s16) = COPY %r0
%1(s16) = COPY %r1
- %2(s16) = G_ADD %0, %1
- %r0 = COPY %2(s16)
+ %2(s32) = G_ANYEXT %0(s16)
+ %3(s32) = G_ANYEXT %1(s16)
+ %4(s32) = G_ADD %2, %3
+ %5(s16) = G_TRUNC %4(s32)
+ %r0 = COPY %5(s16)
BX_RET 14, _, implicit %r0
...
@@ -97,19 +109,28 @@ selected: false
# CHECK: - { id: 0, class: gprb }
# CHECK: - { id: 1, class: gprb }
# CHECK: - { id: 2, class: gprb }
+# CHECK: - { id: 3, class: gprb }
+# CHECK: - { id: 4, class: gprb }
+# CHECK: - { id: 5, class: gprb }
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
body: |
bb.0:
liveins: %r0, %r1
%0(s8) = COPY %r0
%1(s8) = COPY %r1
- %2(s8) = G_ADD %0, %1
- %r0 = COPY %2(s8)
+ %2(s32) = G_ANYEXT %0(s8)
+ %3(s32) = G_ANYEXT %1(s8)
+ %4(s32) = G_ADD %2, %3
+ %5(s8) = G_TRUNC %4(s32)
+ %r0 = COPY %5(s8)
BX_RET 14, _, implicit %r0
...
@@ -123,19 +144,28 @@ selected: false
# CHECK: - { id: 0, class: gprb }
# CHECK: - { id: 1, class: gprb }
# CHECK: - { id: 2, class: gprb }
+# CHECK: - { id: 3, class: gprb }
+# CHECK: - { id: 4, class: gprb }
+# CHECK: - { id: 5, class: gprb }
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
body: |
bb.0:
liveins: %r0, %r1
%0(s1) = COPY %r0
%1(s1) = COPY %r1
- %2(s1) = G_ADD %0, %1
- %r0 = COPY %2(s1)
+ %2(s32) = G_ANYEXT %0(s1)
+ %3(s32) = G_ANYEXT %1(s1)
+ %4(s32) = G_ADD %2, %3
+ %5(s1) = G_TRUNC %4(s32)
+ %r0 = COPY %5(s1)
BX_RET 14, _, implicit %r0
...
@@ -175,19 +205,28 @@ selected: false
# CHECK: - { id: 0, class: gprb }
# CHECK: - { id: 1, class: gprb }
# CHECK: - { id: 2, class: gprb }
+# CHECK: - { id: 3, class: gprb }
+# CHECK: - { id: 4, class: gprb }
+# CHECK: - { id: 5, class: gprb }
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
body: |
bb.0:
liveins: %r0, %r1
%0(s16) = COPY %r0
%1(s16) = COPY %r1
- %2(s16) = G_SUB %0, %1
- %r0 = COPY %2(s16)
+ %2(s32) = G_ANYEXT %0(s16)
+ %3(s32) = G_ANYEXT %1(s16)
+ %4(s32) = G_SUB %2, %3
+ %5(s16) = G_TRUNC %4(s32)
+ %r0 = COPY %5(s16)
BX_RET 14, _, implicit %r0
...
@@ -201,19 +240,28 @@ selected: false
# CHECK: - { id: 0, class: gprb }
# CHECK: - { id: 1, class: gprb }
# CHECK: - { id: 2, class: gprb }
+# CHECK: - { id: 3, class: gprb }
+# CHECK: - { id: 4, class: gprb }
+# CHECK: - { id: 5, class: gprb }
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
body: |
bb.0:
liveins: %r0, %r1
%0(s8) = COPY %r0
%1(s8) = COPY %r1
- %2(s8) = G_SUB %0, %1
- %r0 = COPY %2(s8)
+ %2(s32) = G_ANYEXT %0(s8)
+ %3(s32) = G_ANYEXT %1(s8)
+ %4(s32) = G_SUB %2, %3
+ %5(s8) = G_TRUNC %4(s32)
+ %r0 = COPY %5(s8)
BX_RET 14, _, implicit %r0
...
@@ -253,19 +301,28 @@ selected: false
# CHECK: - { id: 0, class: gprb }
# CHECK: - { id: 1, class: gprb }
# CHECK: - { id: 2, class: gprb }
+# CHECK: - { id: 3, class: gprb }
+# CHECK: - { id: 4, class: gprb }
+# CHECK: - { id: 5, class: gprb }
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
body: |
bb.0:
liveins: %r0, %r1
%0(s16) = COPY %r0
%1(s16) = COPY %r1
- %2(s16) = G_MUL %0, %1
- %r0 = COPY %2(s16)
+ %2(s32) = G_ANYEXT %0(s16)
+ %3(s32) = G_ANYEXT %1(s16)
+ %4(s32) = G_MUL %2, %3
+ %5(s16) = G_TRUNC %4(s32)
+ %r0 = COPY %5(s16)
BX_RET 14, _, implicit %r0
...
@@ -279,19 +336,28 @@ selected: false
# CHECK: - { id: 0, class: gprb }
# CHECK: - { id: 1, class: gprb }
# CHECK: - { id: 2, class: gprb }
+# CHECK: - { id: 3, class: gprb }
+# CHECK: - { id: 4, class: gprb }
+# CHECK: - { id: 5, class: gprb }
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+ - { id: 5, class: _ }
body: |
bb.0:
liveins: %r0, %r1
%0(s8) = COPY %r0
%1(s8) = COPY %r1
- %2(s8) = G_MUL %0, %1
- %r0 = COPY %2(s8)
+ %2(s32) = G_ANYEXT %0(s8)
+ %3(s32) = G_ANYEXT %1(s8)
+ %4(s32) = G_MUL %2, %3
+ %5(s8) = G_TRUNC %4(s32)
+ %r0 = COPY %5(s8)
BX_RET 14, _, implicit %r0
...
@@ -500,6 +566,48 @@ body: |
BX_RET 14, _, implicit %r0
...
---
+name: test_anyext_s8_32
+# CHECK-LABEL: name: test_anyext_s8_32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb }
+# CHECK: - { id: 1, class: gprb }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(s8) = COPY %r0
+ %1(s32) = G_ANYEXT %0(s8)
+ %r0 = COPY %1(s32)
+ BX_RET 14, _, implicit %r0
+...
+---
+name: test_anyext_s16_32
+# CHECK-LABEL: name: test_anyext_s16_32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb }
+# CHECK: - { id: 1, class: gprb }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ liveins: %r0
+
+ %0(s16) = COPY %r0
+ %1(s32) = G_ANYEXT %0(s16)
+ %r0 = COPY %1(s32)
+ BX_RET 14, _, implicit %r0
+...
+---
name: test_trunc_s32_16
# CHECK-LABEL: name: test_trunc_s32_16
legalized: true
diff --git a/test/CodeGen/ARM/divmod-eabi.ll b/test/CodeGen/ARM/divmod-eabi.ll
index ce5a1df05e3f..77ffc46e6a69 100644
--- a/test/CodeGen/ARM/divmod-eabi.ll
+++ b/test/CodeGen/ARM/divmod-eabi.ll
@@ -16,17 +16,15 @@
; RUN: llc -mtriple armv7-linux-gnueabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI
; RUN: llc -mtriple armv7-linux-musleabi %s -o - | FileCheck %s --check-prefix=EABI
; RUN: llc -mtriple armv7-linux-musleabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI
-; RUN: llc -mtriple armv7-apple-darwin %s -o - | FileCheck %s --check-prefixes=DARWIN,DARWIN-DEFAULT
-; RUN: llc -mtriple armv7-apple-darwin %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=DARWIN,DARWIN-O0
-; FIXME: long-term, we will use "-apple-macho" and won't need this exception:
-; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - | FileCheck %s --check-prefixes=DARWIN,DARWIN-DEFAULT
-; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=DARWIN,DARWIN-O0
+; RUN: llc -mtriple armv7-apple-darwin %s -o - | FileCheck %s --check-prefixes=DARWIN
+; RUN: llc -mtriple armv7-apple-darwin %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=DARWIN-O0
; RUN: llc -mtriple thumbv7-windows %s -o - | FileCheck %s --check-prefixes=WINDOWS,WINDOWS-DEFAULT
; RUN: llc -mtriple thumbv7-windows %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=WINDOWS,WINDOWS-O0
define signext i16 @f16(i16 signext %a, i16 signext %b) {
; EABI-LABEL: f16:
; DARWIN-LABEL: f16:
+; DARWIN-O0-LABEL: f16:
; WINDOWS-LABEL: f16:
entry:
%conv = sext i16 %a to i32
@@ -36,11 +34,9 @@ entry:
; EABI: __aeabi_idivmod
; EABI: mov [[div:r[0-9]+]], r0
; EABI: mov [[rem:r[0-9]+]], r1
-; DARWIN: ___divsi3
-; DARWIN: mov [[div:r[0-9]+]], r0
-; DARWIN: __modsi3
-; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]]
-; DARWIN-O0: mov [[rem:r[0-9]+]], r0
+; DARWIN: __divmodsi4
+; DARWIN-O0: __divsi3
+; DARWIN-O0: __modsi3
; WINDOWS: __rt_sdiv
; WINDOWS: __rt_sdiv
; WINDOWS-DEFAULT: add [[sum:r[0-9]+]], r1
@@ -48,16 +44,13 @@ entry:
%rem8 = srem i32 %conv1, %conv
; EABI: __aeabi_idivmod
; DARWIN: __modsi3
+; DARWIN-O0: __modsi3
; WINDOWS: __rt_sdiv
%add = add nsw i32 %rem, %div
%add13 = add nsw i32 %add, %rem8
%conv14 = trunc i32 %add13 to i16
; EABI: add r0{{.*}}r1
; EABI: sxth r0, r0
-; DARWIN-DEFAULT: add [[res:r[0-9]+]], [[sum]], r0
-; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]]
-; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0
-; DARWIN: sxth r0, [[res]]
; WINDOWS-DEFAULT: adds [[sum1:r[0-9]+]], [[sum]], r1
; WINDOWS-O0: adds [[sum:r[0-9]+]], [[rem]],
; WINDOWS-O0: add [[sum1:r[0-9]+]], r1
@@ -68,6 +61,7 @@ entry:
define i32 @f32(i32 %a, i32 %b) {
; EABI-LABEL: f32:
; DARWIN-LABEL: f32:
+; DARWIN-O0-LABEL: f32:
; WINDOWS-LABEL: f32:
entry:
%div = sdiv i32 %a, %b
@@ -75,11 +69,9 @@ entry:
; EABI: __aeabi_idivmod
; EABI: mov [[div:r[0-9]+]], r0
; EABI: mov [[rem:r[0-9]+]], r1
-; DARWIN: ___divsi3
-; DARWIN: mov [[div:r[0-9]+]], r0
-; DARWIN: __modsi3
-; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]]
-; DARWIN-O0: mov [[rem:r[0-9]+]], r0
+; DARWIN: ___divmodsi4
+; DARWIN-O0: __divsi3
+; DARWIN-O0: __modsi3
; WINDOWS: __rt_sdiv
; WINDOWS: mov [[div:r[0-9]+]], r0
; WINDOWS: __rt_sdiv
@@ -87,13 +79,11 @@ entry:
%rem1 = srem i32 %b, %a
; EABI: __aeabi_idivmod
; DARWIN: __modsi3
+; DARWIN-O0: __modsi3
; WINDOWS: __rt_sdiv
%add = add nsw i32 %rem, %div
%add2 = add nsw i32 %add, %rem1
; EABI: add r0{{.*}}r1
-; DARWIN-DEFAULT: add r0, [[sum]], r0
-; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]]
-; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0
; WINDOWS-DEFAULT: adds r0, [[div]], r1
; WINDOWS-O0: adds [[sum:r[0-9]+]], [[rem]], [[div]]
; WINDOWS-O0: add [[sum]], r1
@@ -103,16 +93,15 @@ entry:
define i32 @uf(i32 %a, i32 %b) {
; EABI-LABEL: uf:
; DARWIN-LABEL: uf:
+; DARWIN-O0-LABEL: uf:
; WINDOWS-LABEL: uf:
entry:
%div = udiv i32 %a, %b
%rem = urem i32 %a, %b
; EABI: __aeabi_uidivmod
-; DARWIN: ___udivsi3
-; DARWIN: mov [[div:r[0-9]+]], r0
-; DARWIN: __umodsi3
-; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]]
-; DARWIN-O0: mov [[rem:r[0-9]+]], r0
+; DARWIN: __udivmodsi4
+; DARWIN-O0: __udivsi3
+; DARWIN-O0: __umodsi3
; WINDOWS: __rt_udiv
; WINDOWS: mov [[div:r[0-9]+]], r0
; WINDOWS: __rt_udiv
@@ -120,13 +109,11 @@ entry:
%rem1 = urem i32 %b, %a
; EABI: __aeabi_uidivmod
; DARWIN: __umodsi3
+; DARWIN-O0: __umodsi3
; WINDOWS: __rt_udiv
%add = add nuw i32 %rem, %div
%add2 = add nuw i32 %add, %rem1
; EABI: add r0{{.*}}r1
-; DARWIN-DEFAULT: add r0, [[sum]], r0
-; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]]
-; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0
; WINDOWS-DEFAULT: adds [[sum:r[0-9]+]], [[div]], r1
; WINDOWS-O0: adds [[sum:r[0-9]+]],
; WINDOWS-O0: add [[sum]], r1
@@ -136,6 +123,7 @@ entry:
define i64 @longf(i64 %a, i64 %b) {
; EABI-LABEL: longf:
; DARWIN-LABEL: longf:
+; DARWIN-O0-LABEL: longf:
; WINDOWS-LABEL: longf:
entry:
%div = sdiv i64 %a, %b
@@ -148,6 +136,8 @@ entry:
; DARWIN: mov [[div1:r[0-9]+]], r0
; DARWIN: mov [[div2:r[0-9]+]], r1
; DARWIN: __moddi3
+; DARWIN-O0: __divdi3
+; DARWIN-O0: __moddi3
; WINDOWS: __rt_sdiv64
%add = add nsw i64 %rem, %div
; DARWIN: adds r0{{.*}}[[div1]]
@@ -160,20 +150,19 @@ entry:
define i16 @shortf(i16 %a, i16 %b) {
; EABI-LABEL: shortf:
; DARWIN-LABEL: shortf:
+; DARWIN-O0-LABEL: shortf:
; WINDOWS-LABEL: shortf:
entry:
%div = sdiv i16 %a, %b
%rem = srem i16 %a, %b
; EABI: __aeabi_idivmod
-; DARWIN: ___divsi3
-; DARWIN: mov [[div1:r[0-9]+]], r0
-; DARWIN: __modsi3
+; DARWIN: ___divmodsi4
+; DARWIN-O0: __divmodsi4
; WINDOWS: __rt_sdiv
; WINDOWS: mov [[div:r[0-9]+]], r0
; WINDOWS: __rt_sdiv
%add = add nsw i16 %rem, %div
; EABI: add r0, r1
-; DARWIN: add r0{{.*}}[[div1]]
; WINDOWS: adds r0, r1, [[div]]
ret i16 %add
}
@@ -181,20 +170,20 @@ entry:
define i32 @g1(i32 %a, i32 %b) {
; EABI-LABEL: g1:
; DARWIN-LABEL: g1:
+; DARWIN-O0-LABEL: g1:
; WINDOWS-LABEL: g1:
entry:
%div = sdiv i32 %a, %b
%rem = srem i32 %a, %b
; EABI: __aeabi_idivmod
-; DARWIN: ___divsi3
-; DARWIN: mov [[sum:r[0-9]+]], r0
-; DARWIN: __modsi3
+; DARWIN: ___divmodsi4
+; DARWIN-O0: __divsi3
+; DARWIN-O0: __modsi3
; WINDOWS: __rt_sdiv
; WINDOWS: mov [[div:r[0-9]+]], r0
; WINDOWS: __rt_sdiv
%add = add nsw i32 %rem, %div
; EABI: add r0{{.*}}r1
-; DARWIN: add r0{{.*}}[[sum]]
; WINDOWS: adds r0, r1, [[div]]
ret i32 %add
}
@@ -203,11 +192,13 @@ entry:
define i32 @g2(i32 %a, i32 %b) {
; EABI-LABEL: g2:
; DARWIN-LABEL: g2:
+; DARWIN-O0-LABEL: g2:
; WINDOWS-LABEL: g2:
entry:
%rem = srem i32 %a, %b
; EABI: __aeabi_idivmod
; DARWIN: __modsi3
+; DARWIN-O0: __modsi3
; WINDOWS: __rt_sdiv
ret i32 %rem
; EABI: mov r0, r1
@@ -217,6 +208,7 @@ entry:
define i32 @g3(i32 %a, i32 %b) {
; EABI-LABEL: g3:
; DARWIN-LABEL: g3:
+; DARWIN-O0-LABEL: g3:
; WINDOWS-LABEL: g3:
entry:
%rem = srem i32 %a, %b
@@ -224,11 +216,13 @@ entry:
; EABI: mov [[mod:r[0-9]+]], r1
; DARWIN: __modsi3
; DARWIN: mov [[sum:r[0-9]+]], r0
+; DARWIN-O0: __modsi3
; WINDOWS: __rt_sdiv
; WINDOWS: mov [[rem:r[0-9]+]], r1
%rem1 = srem i32 %b, %rem
; EABI: __aeabi_idivmod
; DARWIN: __modsi3
+; DARWIN-O0: __modsi3
; WINDOWS: __rt_sdiv
%add = add nsw i32 %rem1, %rem
; EABI: add r0, r1, [[mod]]
@@ -240,6 +234,7 @@ entry:
define i32 @g4(i32 %a, i32 %b) {
; EABI-LABEL: g4:
; DARWIN-LABEL: g4:
+; DARWIN-O0-LABEL: g4:
; WINDOWS-LABEL: g4:
entry:
%div = sdiv i32 %a, %b
@@ -247,11 +242,13 @@ entry:
; EABI: mov [[div:r[0-9]+]], r0
; DARWIN: ___divsi3
; DARWIN: mov [[sum:r[0-9]+]], r0
+; DARWIN-O0: __divsi3
; WINDOWS: __rt_sdiv
; WINDOWS: mov [[div:r[0-9]+]], r0
%rem = srem i32 %b, %div
; EABI: __aeabi_idivmod
; DARWIN: __modsi3
+; DARWIN-O0: __modsi3
; WINDOWS: __rt_sdiv
%add = add nsw i32 %rem, %div
; EABI: add r0, r1, [[div]]
diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll
index 9336d0c477d1..ffc1ed09cbf0 100644
--- a/test/CodeGen/ARM/divmod.ll
+++ b/test/CodeGen/ARM/divmod.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=swift | FileCheck %s -check-prefix=SWIFT
+; RUN: llc < %s -mtriple=thumbv7-apple-macho -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
; rdar://12481395
diff --git a/test/CodeGen/AVR/select-mbb-placement-bug.ll b/test/CodeGen/AVR/select-mbb-placement-bug.ll
new file mode 100644
index 000000000000..ca7ec1ab831c
--- /dev/null
+++ b/test/CodeGen/AVR/select-mbb-placement-bug.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mcpu=atmega328p < %s -march=avr | FileCheck %s
+
+; CHECK-LABEL: loopy
+define internal fastcc void @loopy() {
+
+; In this case, when we expand `Select8`/`Select16`, we should be
+; replacing the existing MBB instead of adding a new one.
+;
+; https://github.com/avr-rust/rust/issues/49
+
+; CHECK: LBB0_1:
+; CHECK: LBB0_2:
+; CHECK-NOT: LBB0_3:
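+; (A third label here would mean the expansion created a fresh MBB rather
+; than reusing the existing one, so only LBB0_1 and LBB0_2 may appear.)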
+start:
+ br label %bb7.preheader
+
+bb7.preheader: ; preds = %bb10, %start
+ %i = phi i8 [ 0, %start ], [ %j, %bb10 ]
+ %j = phi i8 [ 1, %start ], [ %next, %bb10 ]
+ br label %bb10
+
+bb4: ; preds = %bb10
+ ret void
+
+bb10: ; preds = %bb7.preheader
+ tail call fastcc void @observe(i8 %i, i8 1)
+ %0 = icmp ult i8 %j, 20
+ %1 = zext i1 %0 to i8
+ %next = add i8 %j, %1
+ br i1 %0, label %bb7.preheader, label %bb4
+
+}
+
+declare void @observe(i8, i8)
+
diff --git a/test/CodeGen/Generic/expand-experimental-reductions.ll b/test/CodeGen/Generic/expand-experimental-reductions.ll
new file mode 100644
index 000000000000..ef813fa7205b
--- /dev/null
+++ b/test/CodeGen/Generic/expand-experimental-reductions.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -expand-reductions -S | FileCheck %s
+; Tests without a target; the pass should therefore expand all reductions.
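+; Each reduction below is expanded into a shuffle-and-operate tree (a
+; cmp/select tree for the min/max variants) that halves the vector on every
+; step and finishes by extracting lane 0, as the CHECK lines show.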
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>)
+
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64>)
+
+declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double>)
+declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double>)
+
+
+define i64 @add_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @add_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @mul_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @mul_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @and_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @and_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @or_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @or_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @xor_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @xor_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define float @fadd_f32(<4 x float> %vec) {
+; CHECK-LABEL: @fadd_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[VEC:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret float [[TMP0]]
+;
+entry:
+ %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
+ ret float %r
+}
+
+define float @fadd_f32_strict(<4 x float> %vec) {
+; CHECK-LABEL: @fadd_f32_strict(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
+ ret float %r
+}
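+; Note that without fast-math flags the ordered fadd reduction above is left
+; as an intrinsic call rather than expanded.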
+
+define float @fmul_f32(<4 x float> %vec) {
+; CHECK-LABEL: @fmul_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[VEC:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul fast <4 x float> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret float [[TMP0]]
+;
+entry:
+ %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
+ ret float %r
+}
+
+define i64 @smax_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @smax_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @smin_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @smin_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @umax_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @umax_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @umin_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @umin_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define double @fmax_f64(<2 x double> %vec) {
+; CHECK-LABEL: @fmax_f64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <2 x double> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret double [[TMP0]]
+;
+entry:
+ %r = call double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %vec)
+ ret double %r
+}
+
+define double @fmin_f64(<2 x double> %vec) {
+; CHECK-LABEL: @fmin_f64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <2 x double> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret double [[TMP0]]
+;
+entry:
+ %r = call double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %vec)
+ ret double %r
+}
diff --git a/test/CodeGen/Hexagon/regalloc-bad-undef.mir b/test/CodeGen/Hexagon/regalloc-bad-undef.mir
index d8fbb92b0d50..a541e766f593 100644
--- a/test/CodeGen/Hexagon/regalloc-bad-undef.mir
+++ b/test/CodeGen/Hexagon/regalloc-bad-undef.mir
@@ -161,17 +161,17 @@ body: |
bb.1.for.body:
successors: %bb.3.for.end, %bb.2.if.end82
- ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0
ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29
undef %29.isub_lo = COPY killed %r0
%29.isub_hi = S2_asr_i_r %29.isub_lo, 31
- ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0
ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29
%32.isub_lo = COPY killed %r0
%7 = S2_extractup %32, 22, 9
- ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0
ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29
undef %43.isub_lo = COPY killed %r0
@@ -179,7 +179,7 @@ body: |
%16 = S2_extractup %43, 6, 25
%18 = A2_tfrpi -1
%18 = S2_asl_r_p_acc %18, %47, %16.isub_lo
- ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3
ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29
%22 = S2_asl_r_p %18, %8.isub_lo
diff --git a/test/CodeGen/Lanai/masking_setccs.ll b/test/CodeGen/Lanai/masking_setccs.ll
new file mode 100644
index 000000000000..48136fd42574
--- /dev/null
+++ b/test/CodeGen/Lanai/masking_setccs.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s | FileCheck %s
+
+; Test that unnecessary masking with 0x1 is not inserted.
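+; The i1 compare results below are already 0 or 1, so materializing a 1
+; ("mov 1") in order to mask them is redundant.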
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "lanai"
+
+; CHECK-LABEL: masking:
+; CHECK-NOT: mov 1
+define i32 @masking(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) {
+entry:
+ %cmp = icmp ne i32 %a, 0
+ %cmp1 = icmp ult i32 %a, %b
+ %or.cond = and i1 %cmp, %cmp1
+ br i1 %or.cond, label %return, label %if.end
+
+if.end: ; preds = %entry
+ %cmp2 = icmp ne i32 %b, 0
+ %cmp4 = icmp ult i32 %b, %c
+ %or.cond29 = and i1 %cmp2, %cmp4
+ br i1 %or.cond29, label %return, label %if.end6
+
+if.end6: ; preds = %if.end
+ %cmp7 = icmp ne i32 %c, 0
+ %cmp9 = icmp ult i32 %c, %d
+ %or.cond30 = and i1 %cmp7, %cmp9
+ br i1 %or.cond30, label %return, label %if.end11
+
+if.end11: ; preds = %if.end6
+ %cmp12 = icmp ne i32 %d, 0
+ %cmp14 = icmp ult i32 %d, %a
+ %or.cond31 = and i1 %cmp12, %cmp14
+ %b. = select i1 %or.cond31, i32 %b, i32 21
+ ret i32 %b.
+
+return: ; preds = %if.end6, %if.end, %entry
+ %retval.0 = phi i32 [ %c, %entry ], [ %d, %if.end ], [ %a, %if.end6 ]
+ ret i32 %retval.0
+}
+
+; CHECK-LABEL: notnot:
+; CHECK-NOT: mov 1
+define i32 @notnot(i32 %x) {
+entry:
+ %tobool = icmp ne i32 %x, 0
+ %lnot.ext = zext i1 %tobool to i32
+ ret i32 %lnot.ext
+}
diff --git a/test/CodeGen/Lanai/peephole-compare.mir b/test/CodeGen/Lanai/peephole-compare.mir
index 5056a05ed1f6..51133b5e58e3 100644
--- a/test/CodeGen/Lanai/peephole-compare.mir
+++ b/test/CodeGen/Lanai/peephole-compare.mir
@@ -644,7 +644,7 @@ body: |
bb.1.if.then:
successors: %bb.2.while.body
- ADJCALLSTACKDOWN 0, implicit-def dead %sp, implicit %sp
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp
CALL @g, csr, implicit-def dead %rca, implicit %sp, implicit-def %sp, implicit-def %rv
ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp
@@ -663,7 +663,7 @@ body: |
bb.4.if.then4:
successors: %bb.5.while.body6
- ADJCALLSTACKDOWN 0, implicit-def dead %sp, implicit %sp
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp
CALL @g, csr, implicit-def dead %rca, implicit %sp, implicit-def %sp, implicit-def %rv
ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp
diff --git a/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir b/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir
new file mode 100644
index 000000000000..96801f5b0a37
--- /dev/null
+++ b/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir
@@ -0,0 +1,24 @@
+# RUN: llc -mtriple=arm-apple-ios -run-pass=if-converter %s -o - | FileCheck %s
+---
+name: foo
+body: |
+ bb.0:
+ B %bb.2
+
+ bb.1:
+ BX_RET 14, 0
+
+ bb.2:
+ Bcc %bb.1, 1, %cpsr
+
+ bb.3:
+ B %bb.1
+
+...
+
+# We should get a single block containing the BX_RET, with no successors at all
+
+# CHECK: body:
+# CHECK-NEXT: bb.0:
+# CHECK-NEXT: BX_RET
+
diff --git a/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir b/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir
new file mode 100644
index 000000000000..5a1583f7a9be
--- /dev/null
+++ b/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir
@@ -0,0 +1,64 @@
+# RUN: llc -mtriple=arm-apple-ios -o - %s -run-pass if-converter | FileCheck %s
+---
+name: f1
+body: |
+ bb.0:
+ successors: %bb.1
+
+ B %bb.1
+
+ bb.1:
+ successors: %bb.2, %bb.4
+
+ Bcc %bb.4, 1, %cpsr
+
+ bb.2:
+ successors: %bb.3, %bb.5
+
+ Bcc %bb.5, 1, %cpsr
+
+ bb.3:
+ successors: %bb.5
+
+ B %bb.5
+
+ bb.4:
+ successors:
+
+ bb.5:
+ successors: %bb.1, %bb.6
+
+ Bcc %bb.1, 1, %cpsr
+
+ bb.6:
+ BX_RET 14, _
+
+...
+
+# IfConversion.cpp/canFallThroughTo thought there was a fallthrough from
+# bb.4 to bb.5 even though the successor list was empty.
+# bb.4 is empty, so it certainly looks like it can fall through, but this is
+# what happens for a bb that just contains an "unreachable".
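+# An "unreachable" terminator leaves the block empty and without successors,
+# so such a block must not be treated as falling through to its layout
+# successor.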
+
+# CHECK: body: |
+# CHECK: bb.0:
+# CHECK: successors: %bb.1
+
+# CHECK: bb.1:
+# CHECK: successors: %bb.3({{.*}}), %bb.2
+
+# The original conditional branch (Bcc) from bb.1, jumping to the empty bb
+# CHECK: Bcc %bb.2
+# CHECK: B %bb.3
+
+# Empty bb.2, which originally contained "unreachable" and thus has no successors
+# CHECK: bb.2:
+# CHECK-NOT: successors
+
+# CHECK: bb.3:
+# CHECK: successors: %bb.1
+
+# Conditional BX_RET and then loop back to bb.1
+# CHECK: BX_RET 0
+# CHECK: B %bb.1
+
diff --git a/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir b/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir
index 2d5347e5d30d..14bb5db5a51d 100644
--- a/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir
+++ b/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir
@@ -60,7 +60,7 @@ body: |
liveins: %eax
MOV32mr %stack.0.tmp, 1, _, 0, _, killed %eax
- ADJCALLSTACKDOWN64 0, 0, implicit-def %rsp, implicit-def dead %eflags, implicit %rsp
+ ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def dead %eflags, implicit %rsp
%rsi = LEA64r %stack.0.tmp, 1, _, 0, _
%edi = MOV32r0 implicit-def dead %eflags
CALL64pcrel32 @doSomething, csr_64, implicit %rsp, implicit %edi, implicit %rsi, implicit-def %rsp, implicit-def %eax
diff --git a/test/CodeGen/MSP430/hwmult16.ll b/test/CodeGen/MSP430/hwmult16.ll
new file mode 100644
index 000000000000..b23f1ad37d81
--- /dev/null
+++ b/test/CodeGen/MSP430/hwmult16.ll
@@ -0,0 +1,43 @@
+; RUN: llc -O0 -mhwmult=16bit < %s | FileCheck %s
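+; -mhwmult=16bit selects the 16-bit hardware multiplier, so multiplications
+; lower to the __mspabi_*_hw libcalls checked below.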
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
+target triple = "msp430---elf"
+
+@g_i32 = global i32 123, align 8
+@g_i64 = global i64 456, align 8
+@g_i16 = global i16 789, align 8
+
+define i16 @mpyi() #0 {
+entry:
+; CHECK: mpyi:
+
+; CHECK: call #__mspabi_mpyi_hw
+ %0 = load volatile i16, i16* @g_i16, align 8
+ %1 = mul i16 %0, %0
+
+ ret i16 %1
+}
+
+define i32 @mpyli() #0 {
+entry:
+; CHECK: mpyli:
+
+; CHECK: call #__mspabi_mpyl_hw
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = mul i32 %0, %0
+
+ ret i32 %1
+}
+
+define i64 @mpylli() #0 {
+entry:
+; CHECK: mpylli:
+
+; CHECK: call #__mspabi_mpyll_hw
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = mul i64 %0, %0
+
+ ret i64 %1
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/MSP430/hwmult32.ll b/test/CodeGen/MSP430/hwmult32.ll
new file mode 100644
index 000000000000..6ffeb9698862
--- /dev/null
+++ b/test/CodeGen/MSP430/hwmult32.ll
@@ -0,0 +1,43 @@
+; RUN: llc -O0 -mhwmult=32bit < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
+target triple = "msp430---elf"
+
+@g_i32 = global i32 123, align 8
+@g_i64 = global i64 456, align 8
+@g_i16 = global i16 789, align 8
+
+define i16 @mpyi() #0 {
+entry:
+; CHECK: mpyi:
+
+; CHECK: call #__mspabi_mpyi_hw
+ %0 = load volatile i16, i16* @g_i16, align 8
+ %1 = mul i16 %0, %0
+
+ ret i16 %1
+}
+
+define i32 @mpyli() #0 {
+entry:
+; CHECK: mpyli:
+
+; CHECK: call #__mspabi_mpyl_hw32
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = mul i32 %0, %0
+
+ ret i32 %1
+}
+
+define i64 @mpylli() #0 {
+entry:
+; CHECK: mpylli:
+
+; CHECK: call #__mspabi_mpyll_hw32
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = mul i64 %0, %0
+
+ ret i64 %1
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/MSP430/hwmultf5.ll b/test/CodeGen/MSP430/hwmultf5.ll
new file mode 100644
index 000000000000..51ca4be4a654
--- /dev/null
+++ b/test/CodeGen/MSP430/hwmultf5.ll
@@ -0,0 +1,43 @@
+; RUN: llc -O0 -mhwmult=f5series < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
+target triple = "msp430---elf"
+
+@g_i32 = global i32 123, align 8
+@g_i64 = global i64 456, align 8
+@g_i16 = global i16 789, align 8
+
+define i16 @mpyi() #0 {
+entry:
+; CHECK: mpyi:
+
+; CHECK: call #__mspabi_mpyi_f5hw
+ %0 = load volatile i16, i16* @g_i16, align 8
+ %1 = mul i16 %0, %0
+
+ ret i16 %1
+}
+
+define i32 @mpyli() #0 {
+entry:
+; CHECK: mpyli:
+
+; CHECK: call #__mspabi_mpyl_f5hw
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = mul i32 %0, %0
+
+ ret i32 %1
+}
+
+define i64 @mpylli() #0 {
+entry:
+; CHECK: mpylli:
+
+; CHECK: call #__mspabi_mpyll_f5hw
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = mul i64 %0, %0
+
+ ret i64 %1
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/MSP430/jumptable.ll b/test/CodeGen/MSP430/jumptable.ll
index 5ccdbb701db1..b4366251698b 100644
--- a/test/CodeGen/MSP430/jumptable.ll
+++ b/test/CodeGen/MSP430/jumptable.ll
@@ -12,7 +12,7 @@ entry:
store i16 %i, i16* %i.addr, align 2
%0 = load i16, i16* %i.addr, align 2
; CHECK: mov.w #2, r13
-; CHECK: call #__mulhi3hw_noint
+; CHECK: call #__mspabi_mpyi
; CHECK: br .LJTI0_0(r12)
switch i16 %0, label %sw.default [
i16 0, label %sw.bb
diff --git a/test/CodeGen/MSP430/libcalls.ll b/test/CodeGen/MSP430/libcalls.ll
new file mode 100644
index 000000000000..950ed6c17e2c
--- /dev/null
+++ b/test/CodeGen/MSP430/libcalls.ll
@@ -0,0 +1,595 @@
+; RUN: llc -O0 < %s | FileCheck %s
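+; None of the operations below have native MSP430 lowerings, so each one
+; becomes a call to the corresponding __mspabi_* runtime helper.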
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
+target triple = "msp430---elf"
+
+@g_double = global double 123.0, align 8
+@g_float = global float 123.0, align 8
+@g_i32 = global i32 123, align 8
+@g_i64 = global i64 456, align 8
+@g_i16 = global i16 789, align 8
+
+define float @d2f() #0 {
+entry:
+; CHECK: d2f:
+
+; CHECK: call #__mspabi_cvtdf
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fptrunc double %0 to float
+
+ ret float %1
+}
+
+define double @f2d() #0 {
+entry:
+; CHECK: f2d:
+
+; CHECK: call #__mspabi_cvtfd
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fpext float %0 to double
+
+ ret double %1
+}
+
+define i32 @d2l() #0 {
+entry:
+; CHECK: d2l:
+
+; CHECK: call #__mspabi_fixdli
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fptosi double %0 to i32
+
+ ret i32 %1
+}
+
+define i64 @d2ll() #0 {
+entry:
+; CHECK: d2ll:
+
+; CHECK: call #__mspabi_fixdlli
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fptosi double %0 to i64
+
+ ret i64 %1
+}
+
+define i32 @d2ul() #0 {
+entry:
+; CHECK: d2ul:
+
+; CHECK: call #__mspabi_fixdul
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fptoui double %0 to i32
+
+ ret i32 %1
+}
+
+define i64 @d2ull() #0 {
+entry:
+; CHECK: d2ull:
+
+; CHECK: call #__mspabi_fixdull
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fptoui double %0 to i64
+
+ ret i64 %1
+}
+
+define i32 @f2l() #0 {
+entry:
+; CHECK: f2l:
+
+; CHECK: call #__mspabi_fixfli
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fptosi float %0 to i32
+
+ ret i32 %1
+}
+
+define i64 @f2ll() #0 {
+entry:
+; CHECK: f2ll:
+
+; CHECK: call #__mspabi_fixflli
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fptosi float %0 to i64
+
+ ret i64 %1
+}
+
+define i32 @f2ul() #0 {
+entry:
+; CHECK: f2ul:
+
+; CHECK: call #__mspabi_fixful
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fptoui float %0 to i32
+
+ ret i32 %1
+}
+
+define i64 @f2ull() #0 {
+entry:
+; CHECK: f2ull:
+
+; CHECK: call #__mspabi_fixfull
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fptoui float %0 to i64
+
+ ret i64 %1
+}
+
+define double @l2d() #0 {
+entry:
+; CHECK: l2d:
+
+; CHECK: call #__mspabi_fltlid
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = sitofp i32 %0 to double
+
+ ret double %1
+}
+
+define double @ll2d() #0 {
+entry:
+; CHECK: ll2d:
+
+; CHECK: call #__mspabi_fltllid
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = sitofp i64 %0 to double
+
+ ret double %1
+}
+
+define double @ul2d() #0 {
+entry:
+; CHECK: ul2d:
+
+; CHECK: call #__mspabi_fltuld
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = uitofp i32 %0 to double
+
+ ret double %1
+}
+
+define double @ull2d() #0 {
+entry:
+; CHECK: ull2d:
+
+; CHECK: call #__mspabi_fltulld
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = uitofp i64 %0 to double
+
+ ret double %1
+}
+
+define float @l2f() #0 {
+entry:
+; CHECK: l2f:
+
+; CHECK: call #__mspabi_fltlif
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = sitofp i32 %0 to float
+
+ ret float %1
+}
+
+define float @ll2f() #0 {
+entry:
+; CHECK: ll2f:
+
+; CHECK: call #__mspabi_fltllif
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = sitofp i64 %0 to float
+
+ ret float %1
+}
+
+define float @ul2f() #0 {
+entry:
+; CHECK: ul2f:
+
+; CHECK: call #__mspabi_fltulf
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = uitofp i32 %0 to float
+
+ ret float %1
+}
+
+define float @ull2f() #0 {
+entry:
+; CHECK: ull2f:
+
+; CHECK: call #__mspabi_fltullf
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = uitofp i64 %0 to float
+
+ ret float %1
+}
+
+define i1 @cmpd_oeq() #0 {
+entry:
+; CHECK: cmpd_oeq:
+
+; CHECK: call #__mspabi_cmpd
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fcmp oeq double %0, 123.0
+
+ ret i1 %1
+}
+
+define i1 @cmpd_une() #0 {
+entry:
+; CHECK: cmpd_une:
+
+; CHECK: call #__mspabi_cmpd
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fcmp une double %0, 123.0
+
+ ret i1 %1
+}
+
+define i1 @cmpd_oge() #0 {
+entry:
+; CHECK: cmpd_oge:
+
+; CHECK: call #__mspabi_cmpd
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fcmp oge double %0, 123.0
+
+ ret i1 %1
+}
+
+define i1 @cmpd_olt() #0 {
+entry:
+; CHECK: cmpd_olt:
+
+; CHECK: call #__mspabi_cmpd
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fcmp olt double %0, 123.0
+
+ ret i1 %1
+}
+
+define i1 @cmpd_ole() #0 {
+entry:
+; CHECK: cmpd_ole:
+
+; CHECK: call #__mspabi_cmpd
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fcmp ole double %0, 123.0
+
+ ret i1 %1
+}
+
+define i1 @cmpd_ogt() #0 {
+entry:
+; CHECK: cmpd_ogt:
+
+; CHECK: call #__mspabi_cmpd
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fcmp ogt double %0, 123.0
+
+ ret i1 %1
+}
+
+define i1 @cmpf_oeq() #0 {
+entry:
+; CHECK: cmpf_oeq:
+
+; CHECK: call #__mspabi_cmpf
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fcmp oeq float %0, 123.0
+
+ ret i1 %1
+}
+
+define i1 @cmpf_une() #0 {
+entry:
+; CHECK: cmpf_une:
+
+; CHECK: call #__mspabi_cmpf
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fcmp une float %0, 123.0
+
+ ret i1 %1
+}
+
+define i1 @cmpf_oge() #0 {
+entry:
+; CHECK: cmpf_oge:
+
+; CHECK: call #__mspabi_cmpf
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fcmp oge float %0, 123.0
+
+ ret i1 %1
+}
+
+define i1 @cmpf_olt() #0 {
+entry:
+; CHECK: cmpf_olt:
+
+; CHECK: call #__mspabi_cmpf
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fcmp olt float %0, 123.0
+
+ ret i1 %1
+}
+
+define i1 @cmpf_ole() #0 {
+entry:
+; CHECK: cmpf_ole:
+
+; CHECK: call #__mspabi_cmpf
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fcmp ole float %0, 123.0
+
+ ret i1 %1
+}
+
+define i1 @cmpf_ogt() #0 {
+entry:
+; CHECK: cmpf_ogt:
+
+; CHECK: call #__mspabi_cmpf
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fcmp ogt float %0, 123.0
+
+ ret i1 %1
+}
+
+define double @addd() #0 {
+entry:
+; CHECK: addd:
+
+; CHECK: call #__mspabi_addd
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fadd double %0, 123.0
+
+ ret double %1
+}
+
+define float @addf() #0 {
+entry:
+; CHECK: addf:
+
+; CHECK: call #__mspabi_addf
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fadd float %0, 123.0
+
+ ret float %1
+}
+
+define double @divd() #0 {
+entry:
+; CHECK: divd:
+
+; CHECK: call #__mspabi_divd
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fdiv double %0, 123.0
+
+ ret double %1
+}
+
+define float @divf() #0 {
+entry:
+; CHECK: divf:
+
+; CHECK: call #__mspabi_divf
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fdiv float %0, 123.0
+
+ ret float %1
+}
+
+define double @mpyd() #0 {
+entry:
+; CHECK: mpyd:
+
+; CHECK: call #__mspabi_mpyd
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fmul double %0, 123.0
+
+ ret double %1
+}
+
+define float @mpyf() #0 {
+entry:
+; CHECK: mpyf:
+
+; CHECK: call #__mspabi_mpyf
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fmul float %0, 123.0
+
+ ret float %1
+}
+
+define double @subd() #0 {
+entry:
+; CHECK: subd:
+
+; CHECK: call #__mspabi_subd
+ %0 = load volatile double, double* @g_double, align 8
+ %1 = fsub double %0, %0
+
+ ret double %1
+}
+
+define float @subf() #0 {
+entry:
+; CHECK: subf:
+
+; CHECK: call #__mspabi_subf
+ %0 = load volatile float, float* @g_float, align 8
+ %1 = fsub float %0, %0
+
+ ret float %1
+}
+
+define i16 @divi() #0 {
+entry:
+; CHECK: divi:
+
+; CHECK: call #__mspabi_divi
+ %0 = load volatile i16, i16* @g_i16, align 8
+ %1 = sdiv i16 %0, %0
+
+ ret i16 %1
+}
+
+define i32 @divli() #0 {
+entry:
+; CHECK: divli:
+
+; CHECK: call #__mspabi_divli
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = sdiv i32 %0, %0
+
+ ret i32 %1
+}
+
+define i64 @divlli() #0 {
+entry:
+; CHECK: divlli:
+
+; CHECK: call #__mspabi_divlli
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = sdiv i64 %0, %0
+
+ ret i64 %1
+}
+
+define i16 @divu() #0 {
+entry:
+; CHECK: divu:
+
+; CHECK: call #__mspabi_divu
+ %0 = load volatile i16, i16* @g_i16, align 8
+ %1 = udiv i16 %0, %0
+
+ ret i16 %1
+}
+
+define i32 @divul() #0 {
+entry:
+; CHECK: divul:
+
+; CHECK: call #__mspabi_divul
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = udiv i32 %0, %0
+
+ ret i32 %1
+}
+
+define i64 @divull() #0 {
+entry:
+; CHECK: divull:
+
+; CHECK: call #__mspabi_divull
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = udiv i64 %0, %0
+
+ ret i64 %1
+}
+
+define i16 @remi() #0 {
+entry:
+; CHECK: remi:
+
+; CHECK: call #__mspabi_remi
+ %0 = load volatile i16, i16* @g_i16, align 8
+ %1 = srem i16 %0, %0
+
+ ret i16 %1
+}
+
+define i32 @remli() #0 {
+entry:
+; CHECK: remli:
+
+; CHECK: call #__mspabi_remli
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = srem i32 %0, %0
+
+ ret i32 %1
+}
+
+define i64 @remlli() #0 {
+entry:
+; CHECK: remlli:
+
+; CHECK: call #__mspabi_remlli
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = srem i64 %0, %0
+
+ ret i64 %1
+}
+
+define i16 @remu() #0 {
+entry:
+; CHECK: remu:
+
+; CHECK: call #__mspabi_remu
+ %0 = load volatile i16, i16* @g_i16, align 8
+ %1 = urem i16 %0, %0
+
+ ret i16 %1
+}
+
+define i32 @remul() #0 {
+entry:
+; CHECK: remul:
+
+; CHECK: call #__mspabi_remul
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = urem i32 %0, %0
+
+ ret i32 %1
+}
+
+define i64 @remull() #0 {
+entry:
+; CHECK: remull:
+
+; CHECK: call #__mspabi_remull
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = urem i64 %0, %0
+
+ ret i64 %1
+}
+
+define i16 @mpyi() #0 {
+entry:
+; CHECK: mpyi:
+
+; CHECK: call #__mspabi_mpyi
+ %0 = load volatile i16, i16* @g_i16, align 8
+ %1 = mul i16 %0, %0
+
+ ret i16 %1
+}
+
+define i32 @mpyli() #0 {
+entry:
+; CHECK: mpyli:
+
+; CHECK: call #__mspabi_mpyl
+ %0 = load volatile i32, i32* @g_i32, align 8
+ %1 = mul i32 %0, %0
+
+ ret i32 %1
+}
+
+define i64 @mpylli() #0 {
+entry:
+; CHECK: mpylli:
+
+; CHECK: call #__mspabi_mpyll
+ %0 = load volatile i64, i64* @g_i64, align 8
+ %1 = mul i64 %0, %0
+
+ ret i64 %1
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll b/test/CodeGen/MSP430/promote-i8-mul.ll
index dce9d25ca87a..0e05e3978b1e 100644
--- a/test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll
+++ b/test/CodeGen/MSP430/promote-i8-mul.ll
@@ -8,7 +8,7 @@ target triple = "msp430-elf"
define signext i8 @foo(i8 signext %_si1, i8 signext %_si2) nounwind readnone {
entry:
; CHECK-LABEL: foo:
-; CHECK: call #__mulqi3
+; CHECK: call #__mspabi_mpyi
%mul = mul i8 %_si2, %_si1 ; <i8> [#uses=1]
ret i8 %mul
}
diff --git a/test/CodeGen/NVPTX/bug17709.ll b/test/CodeGen/NVPTX/bug17709.ll
index 076c44684579..6d747f09d8a7 100644
--- a/test/CodeGen/NVPTX/bug17709.ll
+++ b/test/CodeGen/NVPTX/bug17709.ll
@@ -1,26 +1,26 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-
-; ModuleID = '__kernelgen_main_module'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
-
-define private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) {
-entry:
- ;unreachable
- %t0 = insertvalue {double, double} undef, double 1.0, 0
- %t1 = insertvalue {double, double} %t0, double 1.0, 1
- ret { double, double } %t1
-}
-
-%struct.descriptor_dimension.0.52 = type { i64, i64, i64 }
-%"struct.array2_complex(kind=8).37.18.70" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
-%"struct.array2_complex(kind=8).43.5.57" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
-@replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096
-
-; CHECK: .visible .entry __kernelgen_main
-define ptx_kernel void @__kernelgen_main(i32* nocapture %args, i32*) {
-entry:
- %1 = tail call ptx_device { double, double } bitcast ({ double, double } (%"struct.array2_complex(kind=8).43.5.57"*)* @__utils1_MOD_trace to { double, double } (%"struct.array2_complex(kind=8).37.18.70"*)*)(%"struct.array2_complex(kind=8).37.18.70"* noalias @replacementOfAlloca8)
- ret void
-}
-
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+
+; ModuleID = '__kernelgen_main_module'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) {
+entry:
+ ;unreachable
+ %t0 = insertvalue {double, double} undef, double 1.0, 0
+ %t1 = insertvalue {double, double} %t0, double 1.0, 1
+ ret { double, double } %t1
+}
+
+%struct.descriptor_dimension.0.52 = type { i64, i64, i64 }
+%"struct.array2_complex(kind=8).37.18.70" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
+%"struct.array2_complex(kind=8).43.5.57" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
+@replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096
+
+; CHECK: .visible .entry __kernelgen_main
+define ptx_kernel void @__kernelgen_main(i32* nocapture %args, i32*) {
+entry:
+ %1 = tail call ptx_device { double, double } bitcast ({ double, double } (%"struct.array2_complex(kind=8).43.5.57"*)* @__utils1_MOD_trace to { double, double } (%"struct.array2_complex(kind=8).37.18.70"*)*)(%"struct.array2_complex(kind=8).37.18.70"* noalias @replacementOfAlloca8)
+ ret void
+}
+
diff --git a/test/CodeGen/NVPTX/ctlz.ll b/test/CodeGen/NVPTX/ctlz.ll
index 005958bd938a..7aa29fe811dd 100644
--- a/test/CodeGen/NVPTX/ctlz.ll
+++ b/test/CodeGen/NVPTX/ctlz.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
diff --git a/test/CodeGen/NVPTX/ctpop.ll b/test/CodeGen/NVPTX/ctpop.ll
index b961d4d27bdd..69a4f879a8d8 100644
--- a/test/CodeGen/NVPTX/ctpop.ll
+++ b/test/CodeGen/NVPTX/ctpop.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
diff --git a/test/CodeGen/NVPTX/cttz.ll b/test/CodeGen/NVPTX/cttz.ll
index 124ba9d1e9a7..0bfe0139bcdf 100644
--- a/test/CodeGen/NVPTX/cttz.ll
+++ b/test/CodeGen/NVPTX/cttz.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
diff --git a/test/CodeGen/NVPTX/f16-instructions.ll b/test/CodeGen/NVPTX/f16-instructions.ll
index 3d4140820794..08a2ee14e8bd 100644
--- a/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/test/CodeGen/NVPTX/f16-instructions.ll
@@ -1,1078 +1,1079 @@
-; ## Full FP16 support enabled by default.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -disable-fp-elim \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
-; ## FP16 support explicitly disabled.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
-; ## FP16 is not supported by hardware.
-; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
-; RUN: -disable-post-ra -disable-fp-elim \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-
-; CHECK-LABEL: test_ret_const(
-; CHECK: mov.b16 [[R:%h[0-9]+]], 0x3C00;
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_ret_const() #0 {
- ret half 1.0
-}
-
-; CHECK-LABEL: test_fadd(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_param_1];
-; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_fadd(half %a, half %b) #0 {
- %r = fadd half %a, %b
- ret half %r
-}
-
-; CHECK-LABEL: test_fadd_v1f16(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_v1f16_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_v1f16_param_1];
-; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 {
- %r = fadd <1 x half> %a, %b
- ret <1 x half> %r
-}
-
-; Check that we can lower fadd with immediate arguments.
-; CHECK-LABEL: test_fadd_imm_0(
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_0_param_0];
-; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00;
-; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_fadd_imm_0(half %b) #0 {
- %r = fadd half 1.0, %b
- ret half %r
-}
-
-; CHECK-LABEL: test_fadd_imm_1(
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_1_param_0];
-; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00;
-; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_fadd_imm_1(half %a) #0 {
- %r = fadd half %a, 1.0
- ret half %r
-}
-
-; CHECK-LABEL: test_fsub(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fsub_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fsub_param_1];
-; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_fsub(half %a, half %b) #0 {
- %r = fsub half %a, %b
- ret half %r
-}
-
-; CHECK-LABEL: test_fneg(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fneg_param_0];
-; CHECK-F16-NEXT: mov.b16 [[Z:%h[0-9]+]], 0x0000
-; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[Z]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
-; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[Z]], [[A32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_fneg(half %a) #0 {
- %r = fsub half 0.0, %a
- ret half %r
-}
-
-; CHECK-LABEL: test_fmul(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmul_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmul_param_1];
-; CHECK-F16-NEXT: mul.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-NEXT: mul.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_fmul(half %a, half %b) #0 {
- %r = fmul half %a, %b
- ret half %r
-}
-
-; CHECK-LABEL: test_fdiv(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fdiv_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fdiv_param_1];
-; CHECK-DAG: cvt.f32.f16 [[F0:%f[0-9]+]], [[A]];
-; CHECK-DAG: cvt.f32.f16 [[F1:%f[0-9]+]], [[B]];
-; CHECK-NEXT: div.rn.f32 [[FR:%f[0-9]+]], [[F0]], [[F1]];
-; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[FR]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_fdiv(half %a, half %b) #0 {
- %r = fdiv half %a, %b
- ret half %r
-}
-
-; CHECK-LABEL: test_frem(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_frem_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_frem_param_1];
-; CHECK-DAG: cvt.f32.f16 [[FA:%f[0-9]+]], [[A]];
-; CHECK-DAG: cvt.f32.f16 [[FB:%f[0-9]+]], [[B]];
-; CHECK-NEXT: div.rn.f32 [[D:%f[0-9]+]], [[FA]], [[FB]];
-; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]];
-; CHECK-NEXT: mul.f32 [[RI:%f[0-9]+]], [[DI]], [[FB]];
-; CHECK-NEXT: sub.f32 [[RF:%f[0-9]+]], [[FA]], [[RI]];
-; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_frem(half %a, half %b) #0 {
- %r = frem half %a, %b
- ret half %r
-}
-
-; CHECK-LABEL: test_store(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_store_param_0];
-; CHECK-DAG: ld.param.u64 %[[PTR:rd[0-9]+]], [test_store_param_1];
-; CHECK-NEXT: st.b16 [%[[PTR]]], [[A]];
-; CHECK-NEXT: ret;
-define void @test_store(half %a, half* %b) #0 {
- store half %a, half* %b
- ret void
-}
-
-; CHECK-LABEL: test_load(
-; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0];
-; CHECK-NEXT: ld.b16 [[R:%h[0-9]+]], [%[[PTR]]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_load(half* %a) #0 {
- %r = load half, half* %a
- ret half %r
-}
-
-; CHECK-LABEL: .visible .func test_halfp0a1(
-; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0];
-; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1];
-; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
-; CHECK-DAG: st.u8 [%[[TO]]], [[B0]]
-; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
-; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]]
-; CHECK: ret
-define void @test_halfp0a1(half * noalias readonly %from, half * %to) {
- %1 = load half, half * %from , align 1
- store half %1, half * %to , align 1
- ret void
-}
-
-declare half @test_callee(half %a, half %b) #0
-
-; CHECK-LABEL: test_call(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_param_1];
-; CHECK: {
-; CHECK-DAG: .param .b32 param0;
-; CHECK-DAG: .param .b32 param1;
-; CHECK-DAG: st.param.b16 [param0+0], [[A]];
-; CHECK-DAG: st.param.b16 [param1+0], [[B]];
-; CHECK-DAG: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_callee,
-; CHECK: );
-; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
-; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_call(half %a, half %b) #0 {
- %r = call half @test_callee(half %a, half %b)
- ret half %r
-}
-
-; CHECK-LABEL: test_call_flipped(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_flipped_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_flipped_param_1];
-; CHECK: {
-; CHECK-DAG: .param .b32 param0;
-; CHECK-DAG: .param .b32 param1;
-; CHECK-DAG: st.param.b16 [param0+0], [[B]];
-; CHECK-DAG: st.param.b16 [param1+0], [[A]];
-; CHECK-DAG: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_callee,
-; CHECK: );
-; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
-; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_call_flipped(half %a, half %b) #0 {
- %r = call half @test_callee(half %b, half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_tailcall_flipped(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_tailcall_flipped_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_tailcall_flipped_param_1];
-; CHECK: {
-; CHECK-DAG: .param .b32 param0;
-; CHECK-DAG: .param .b32 param1;
-; CHECK-DAG: st.param.b16 [param0+0], [[B]];
-; CHECK-DAG: st.param.b16 [param1+0], [[A]];
-; CHECK-DAG: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_callee,
-; CHECK: );
-; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
-; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_tailcall_flipped(half %a, half %b) #0 {
- %r = tail call half @test_callee(half %b, half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_select(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1];
-; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
-; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
- %r = select i1 %c, half %a, half %b
- ret half %r
-}
-
-; CHECK-LABEL: test_select_cc(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_param_1];
-; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_param_2];
-; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_param_3];
-; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
-; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
-; CHECK: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
- %cc = fcmp une half %c, %d
- %r = select i1 %cc, half %a, half %b
- ret half %r
-}
-
-; CHECK-LABEL: test_select_cc_f32_f16(
-; CHECK-DAG: ld.param.f32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0];
-; CHECK-DAG: ld.param.f32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1];
-; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_f32_f16_param_2];
-; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_f32_f16_param_3];
-; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
-; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
-; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.f32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
- %cc = fcmp une half %c, %d
- %r = select i1 %cc, float %a, float %b
- ret float %r
-}
-
-; CHECK-LABEL: test_select_cc_f16_f32(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_f16_f32_param_0];
-; CHECK-DAG: ld.param.f32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2];
-; CHECK-DAG: ld.param.f32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3];
-; CHECK-DAG: setp.neu.f32 [[PRED:%p[0-9]+]], [[C]], [[D]]
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_f16_f32_param_1];
-; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 {
- %cc = fcmp une float %c, %d
- %r = select i1 %cc, half %a, half %b
- ret half %r
-}
-
-; CHECK-LABEL: test_fcmp_une(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_une_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_une_param_1];
-; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_une(half %a, half %b) #0 {
- %r = fcmp une half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_ueq(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ueq_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ueq_param_1];
-; CHECK-F16: setp.equ.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.equ.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_ueq(half %a, half %b) #0 {
- %r = fcmp ueq half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_ugt(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ugt_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ugt_param_1];
-; CHECK-F16: setp.gtu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.gtu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_ugt(half %a, half %b) #0 {
- %r = fcmp ugt half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_uge(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uge_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uge_param_1];
-; CHECK-F16: setp.geu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.geu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_uge(half %a, half %b) #0 {
- %r = fcmp uge half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_ult(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ult_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ult_param_1];
-; CHECK-F16: setp.ltu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.ltu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_ult(half %a, half %b) #0 {
- %r = fcmp ult half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_ule(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ule_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ule_param_1];
-; CHECK-F16: setp.leu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.leu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_ule(half %a, half %b) #0 {
- %r = fcmp ule half %a, %b
- ret i1 %r
-}
-
-
-; CHECK-LABEL: test_fcmp_uno(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uno_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uno_param_1];
-; CHECK-F16: setp.nan.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.nan.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_uno(half %a, half %b) #0 {
- %r = fcmp uno half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_one(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_one_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_one_param_1];
-; CHECK-F16: setp.ne.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.ne.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_one(half %a, half %b) #0 {
- %r = fcmp one half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_oeq(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oeq_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oeq_param_1];
-; CHECK-F16: setp.eq.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.eq.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_oeq(half %a, half %b) #0 {
- %r = fcmp oeq half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_ogt(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ogt_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ogt_param_1];
-; CHECK-F16: setp.gt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.gt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_ogt(half %a, half %b) #0 {
- %r = fcmp ogt half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_oge(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oge_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oge_param_1];
-; CHECK-F16: setp.ge.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.ge.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_oge(half %a, half %b) #0 {
- %r = fcmp oge half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_olt(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_olt_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_olt_param_1];
-; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_olt(half %a, half %b) #0 {
- %r = fcmp olt half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_ole(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ole_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ole_param_1];
-; CHECK-F16: setp.le.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.le.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_ole(half %a, half %b) #0 {
- %r = fcmp ole half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_ord(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ord_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ord_param_1];
-; CHECK-F16: setp.num.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.num.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_ord(half %a, half %b) #0 {
- %r = fcmp ord half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_br_cc(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_br_cc_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_br_cc_param_1];
-; CHECK-DAG: ld.param.u64 %[[C:rd[0-9]+]], [test_br_cc_param_2];
-; CHECK-DAG: ld.param.u64 %[[D:rd[0-9]+]], [test_br_cc_param_3];
-; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: @[[PRED]] bra [[LABEL:LBB.*]];
-; CHECK: st.u32 [%[[C]]],
-; CHECK: [[LABEL]]:
-; CHECK: st.u32 [%[[D]]],
-; CHECK: ret;
-define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 {
- %c = fcmp uge half %a, %b
- br i1 %c, label %then, label %else
-then:
- store i32 0, i32* %p1
- ret void
-else:
- store i32 0, i32* %p2
- ret void
-}
-
-; CHECK-LABEL: test_phi(
-; CHECK: ld.param.u64 %[[P1:rd[0-9]+]], [test_phi_param_0];
-; CHECK: ld.b16 {{%h[0-9]+}}, [%[[P1]]];
-; CHECK: [[LOOP:LBB[0-9_]+]]:
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[AB:%h[0-9]+]];
-; CHECK: ld.b16 [[AB:%h[0-9]+]], [%[[P1]]];
-; CHECK: {
-; CHECK: st.param.b64 [param0+0], %[[P1]];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_dummy
-; CHECK: }
-; CHECK: setp.eq.b32 [[PRED:%p[0-9]+]], %r{{[0-9]+}}, 1;
-; CHECK: @[[PRED]] bra [[LOOP]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_phi(half* %p1) #0 {
-entry:
- %a = load half, half* %p1
- br label %loop
-loop:
- %r = phi half [%a, %entry], [%b, %loop]
- %b = load half, half* %p1
- %c = call i1 @test_dummy(half* %p1)
- br i1 %c, label %loop, label %return
-return:
- ret half %r
-}
-declare i1 @test_dummy(half* %p1) #0
-
-; CHECK-LABEL: test_fptosi_i32(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i32_param_0];
-; CHECK: cvt.rzi.s32.f16 [[R:%r[0-9]+]], [[A]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i32 @test_fptosi_i32(half %a) #0 {
- %r = fptosi half %a to i32
- ret i32 %r
-}
-
-; CHECK-LABEL: test_fptosi_i64(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i64_param_0];
-; CHECK: cvt.rzi.s64.f16 [[R:%rd[0-9]+]], [[A]];
-; CHECK: st.param.b64 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i64 @test_fptosi_i64(half %a) #0 {
- %r = fptosi half %a to i64
- ret i64 %r
-}
-
-; CHECK-LABEL: test_fptoui_i32(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i32_param_0];
-; CHECK: cvt.rzi.u32.f16 [[R:%r[0-9]+]], [[A]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i32 @test_fptoui_i32(half %a) #0 {
- %r = fptoui half %a to i32
- ret i32 %r
-}
-
-; CHECK-LABEL: test_fptoui_i64(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i64_param_0];
-; CHECK: cvt.rzi.u64.f16 [[R:%rd[0-9]+]], [[A]];
-; CHECK: st.param.b64 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i64 @test_fptoui_i64(half %a) #0 {
- %r = fptoui half %a to i64
- ret i64 %r
-}
-
-; CHECK-LABEL: test_uitofp_i32(
-; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
-; CHECK: cvt.rn.f16.u32 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_uitofp_i32(i32 %a) #0 {
- %r = uitofp i32 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_uitofp_i64(
-; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
-; CHECK: cvt.rn.f16.u64 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_uitofp_i64(i64 %a) #0 {
- %r = uitofp i64 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_sitofp_i32(
-; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
-; CHECK: cvt.rn.f16.s32 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sitofp_i32(i32 %a) #0 {
- %r = sitofp i32 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_sitofp_i64(
-; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
-; CHECK: cvt.rn.f16.s64 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sitofp_i64(i64 %a) #0 {
- %r = sitofp i64 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_uitofp_i32_fadd(
-; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
-; CHECK-DAG: cvt.rn.f16.u32 [[C:%h[0-9]+]], [[A]];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_uitofp_i32_fadd_param_1];
-; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
- %c = uitofp i32 %a to half
- %r = fadd half %b, %c
- ret half %r
-}
-
-; CHECK-LABEL: test_sitofp_i32_fadd(
-; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
-; CHECK-DAG: cvt.rn.f16.s32 [[C:%h[0-9]+]], [[A]];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_sitofp_i32_fadd_param_1];
-; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
-; XCHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; XCHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; XCHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
-; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
- %c = sitofp i32 %a to half
- %r = fadd half %b, %c
- ret half %r
-}
-
-; CHECK-LABEL: test_fptrunc_float(
-; CHECK: ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fptrunc_float(float %a) #0 {
- %r = fptrunc float %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_fptrunc_double(
-; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
-; CHECK: cvt.rn.f16.f64 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fptrunc_double(double %a) #0 {
- %r = fptrunc double %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_fpext_float(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_float_param_0];
-; CHECK: cvt.f32.f16 [[R:%f[0-9]+]], [[A]];
-; CHECK: st.param.f32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define float @test_fpext_float(half %a) #0 {
- %r = fpext half %a to float
- ret float %r
-}
-
-; CHECK-LABEL: test_fpext_double(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_double_param_0];
-; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]];
-; CHECK: st.param.f64 [func_retval0+0], [[R]];
-; CHECK: ret;
-define double @test_fpext_double(half %a) #0 {
- %r = fpext half %a to double
- ret double %r
-}
-
-
-; CHECK-LABEL: test_bitcast_halftoi16(
-; CHECK: ld.param.b16 [[AH:%h[0-9]+]], [test_bitcast_halftoi16_param_0];
-; CHECK: mov.b16 [[AS:%rs[0-9]+]], [[AH]]
-; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[AS]]
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i16 @test_bitcast_halftoi16(half %a) #0 {
- %r = bitcast half %a to i16
- ret i16 %r
-}
-
-; CHECK-LABEL: test_bitcast_i16tohalf(
-; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
-; CHECK: mov.b16 [[AH:%h[0-9]+]], [[AS]]
-; CHECK: st.param.b16 [func_retval0+0], [[AH]];
-; CHECK: ret;
-define half @test_bitcast_i16tohalf(i16 %a) #0 {
- %r = bitcast i16 %a to half
- ret half %r
-}
-
-
-declare half @llvm.sqrt.f16(half %a) #0
-declare half @llvm.powi.f16(half %a, i32 %b) #0
-declare half @llvm.sin.f16(half %a) #0
-declare half @llvm.cos.f16(half %a) #0
-declare half @llvm.pow.f16(half %a, half %b) #0
-declare half @llvm.exp.f16(half %a) #0
-declare half @llvm.exp2.f16(half %a) #0
-declare half @llvm.log.f16(half %a) #0
-declare half @llvm.log10.f16(half %a) #0
-declare half @llvm.log2.f16(half %a) #0
-declare half @llvm.fma.f16(half %a, half %b, half %c) #0
-declare half @llvm.fabs.f16(half %a) #0
-declare half @llvm.minnum.f16(half %a, half %b) #0
-declare half @llvm.maxnum.f16(half %a, half %b) #0
-declare half @llvm.copysign.f16(half %a, half %b) #0
-declare half @llvm.floor.f16(half %a) #0
-declare half @llvm.ceil.f16(half %a) #0
-declare half @llvm.trunc.f16(half %a) #0
-declare half @llvm.rint.f16(half %a) #0
-declare half @llvm.nearbyint.f16(half %a) #0
-declare half @llvm.round.f16(half %a) #0
-declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
-
-; CHECK-LABEL: test_sqrt(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sqrt_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: sqrt.rn.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sqrt(half %a) #0 {
- %r = call half @llvm.sqrt.f16(half %a)
- ret half %r
-}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_powi(
-;define half @test_powi(half %a, i32 %b) #0 {
-; %r = call half @llvm.powi.f16(half %a, i32 %b)
-; ret half %r
-;}
-
-; CHECK-LABEL: test_sin(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sin_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: sin.approx.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sin(half %a) #0 #1 {
- %r = call half @llvm.sin.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_cos(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_cos_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: cos.approx.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_cos(half %a) #0 #1 {
- %r = call half @llvm.cos.f16(half %a)
- ret half %r
-}
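; sin.approx.f32 and cos.approx.f32 are fast hardware approximations with
; reduced accuracy, so these lowerings are only selected for functions
; carrying attribute #1 ("unsafe-fp-math"="true", declared at the end of
; the file), under which llvm.sin/llvm.cos may be relaxed.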
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_pow(
-;define half @test_pow(half %a, half %b) #0 {
-; %r = call half @llvm.pow.f16(half %a, half %b)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_exp(
-;define half @test_exp(half %a) #0 {
-; %r = call half @llvm.exp.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_exp2(
-;define half @test_exp2(half %a) #0 {
-; %r = call half @llvm.exp2.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log(
-;define half @test_log(half %a) #0 {
-; %r = call half @llvm.log.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log10(
-;define half @test_log10(half %a) #0 {
-; %r = call half @llvm.log10.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log2(
-;define half @test_log2(half %a) #0 {
-; %r = call half @llvm.log2.f16(half %a)
-; ret half %r
-;}
-
-; CHECK-LABEL: test_fma(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fma_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fma_param_1];
-; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fma_param_2];
-; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret
-define half @test_fma(half %a, half %b, half %c) #0 {
- %r = call half @llvm.fma.f16(half %a, half %b, half %c)
- ret half %r
-}
-
-; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fabs_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: abs.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fabs(half %a) #0 {
- %r = call half @llvm.fabs.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_minnum(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_minnum_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_minnum_param_1];
-; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK: min.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_minnum(half %a, half %b) #0 {
- %r = call half @llvm.minnum.f16(half %a, half %b)
- ret half %r
-}
-
-; CHECK-LABEL: test_maxnum(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_maxnum_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_maxnum_param_1];
-; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK: max.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_maxnum(half %a, half %b) #0 {
- %r = call half @llvm.maxnum.f16(half %a, half %b)
- ret half %r
-}
-
-; CHECK-LABEL: test_copysign(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_param_0];
-; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_param_1];
-; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
-; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_copysign(half %a, half %b) #0 {
- %r = call half @llvm.copysign.f16(half %a, half %b)
- ret half %r
-}
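; copysign is pure bit manipulation on the half encoding: keep the
; magnitude bits of %a (mask 32767 = 0x7FFF) and the sign bit of %b
; (mask -32768 = 0x8000), then or them together. Worked example:
; a = 1.0 (0x3C00), b = -2.0 (0xC000) gives
; (0x3C00 & 0x7FFF) | (0xC000 & 0x8000) = 0x3C00 | 0x8000 = 0xBC00 = -1.0.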
-
-; CHECK-LABEL: test_copysign_f32(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f32_param_0];
-; CHECK-DAG: ld.param.f32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
-; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
-; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648;
-; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16;
-; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]];
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_copysign_f32(half %a, float %b) #0 {
- %tb = fptrunc float %b to half
- %r = call half @llvm.copysign.f16(half %a, half %tb)
- ret half %r
-}
-
-; CHECK-LABEL: test_copysign_f64(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f64_param_0];
-; CHECK-DAG: ld.param.f64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
-; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
-; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
-; CHECK-DAG: shr.u64 [[BX1:%rd[0-9]+]], [[BX0]], 48;
-; CHECK-DAG: cvt.u16.u64 [[BX2:%rs[0-9]+]], [[BX1]];
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_copysign_f64(half %a, double %b) #0 {
- %tb = fptrunc double %b to half
- %r = call half @llvm.copysign.f16(half %a, half %tb)
- ret half %r
-}
-
-; CHECK-LABEL: test_copysign_extended(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_extended_param_0];
-; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_extended_param_1];
-; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
-; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: cvt.f32.f16 [[XR:%f[0-9]+]], [[R]];
-; CHECK: st.param.f32 [func_retval0+0], [[XR]];
-; CHECK: ret;
-define float @test_copysign_extended(half %a, half %b) #0 {
- %r = call half @llvm.copysign.f16(half %a, half %b)
- %xr = fpext half %r to float
- ret float %xr
-}
-
-; CHECK-LABEL: test_floor(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_floor_param_0];
-; CHECK: cvt.rmi.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_floor(half %a) #0 {
- %r = call half @llvm.floor.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_ceil(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_ceil_param_0];
-; CHECK: cvt.rpi.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_ceil(half %a) #0 {
- %r = call half @llvm.ceil.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_trunc(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_trunc_param_0];
-; CHECK: cvt.rzi.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_trunc(half %a) #0 {
- %r = call half @llvm.trunc.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_rint(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_rint_param_0];
-; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_rint(half %a) #0 {
- %r = call half @llvm.rint.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_nearbyint(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_nearbyint_param_0];
-; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_nearbyint(half %a) #0 {
- %r = call half @llvm.nearbyint.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_round(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_round_param_0];
-; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_round(half %a) #0 {
- %r = call half @llvm.round.f16(half %a)
- ret half %r
-}
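; The rounding intrinsics map onto the cvt rounding modifiers: .rmi rounds
; toward minus infinity (floor), .rpi toward plus infinity (ceil), .rzi
; toward zero (trunc), and .rni to the nearest integer, ties to even (rint
; and nearbyint). Note that llvm.round is defined to round ties away from
; zero, yet the checks above accept cvt.rni for it as well.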
-
-; CHECK-LABEL: test_fmuladd(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmuladd_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmuladd_param_1];
-; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fmuladd_param_2];
-; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fmuladd(half %a, half %b, half %c) #0 {
- %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
- ret half %r
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { "unsafe-fp-math" = "true" }
+; ## Full FP16 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN: -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+
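; The three RUN lines above feed the same IR to llc under three
; configurations. FileCheck's -check-prefixes flag enables several prefixes
; per run, so plain CHECK lines are verified in every configuration,
; CHECK-F16 lines only when native f16 instructions are available (sm_53),
; and CHECK-NOF16 lines when f16 math is disabled by flag or unsupported by
; the target (sm_52).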
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_ret_const(
+; CHECK: mov.b16 [[R:%h[0-9]+]], 0x3C00;
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_ret_const() #0 {
+ ret half 1.0
+}
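; 0x3C00 is 1.0 encoded as an IEEE-754 binary16 value: sign 0, biased
; exponent 01111 (15, so 2^(15-15) = 2^0), fraction 0000000000, giving
; (-1)^0 * 1.0 * 2^0 = 1.0. Bit layout: 0 01111 0000000000 = 0x3C00.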
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd(half %a, half %b) #0 {
+ %r = fadd half %a, %b
+ ret half %r
+}
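; When native f16 math is unavailable, the operands are promoted to f32,
; the arithmetic is done in f32, and the result is truncated back to f16,
; exactly as the CHECK-NOF16 lines above verify. A minimal IR sketch of the
; equivalent computation (illustration only, not part of the checked test):
;   %a32 = fpext half %a to float
;   %b32 = fpext half %b to float
;   %r32 = fadd float %a32, %b32
;   %r   = fptrunc float %r32 to half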
+
+; CHECK-LABEL: test_fadd_v1f16(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_v1f16_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_v1f16_param_1];
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 {
+ %r = fadd <1 x half> %a, %b
+ ret <1 x half> %r
+}
+
+; Check that we can lower fadd with immediate arguments.
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_0_param_0];
+; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00;
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd_imm_0(half %b) #0 {
+ %r = fadd half 1.0, %b
+ ret half %r
+}
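; In the promoted form the constant is folded directly into the f32 add:
; 0f3F800000 is PTX hex notation for the 32-bit float 1.0 (sign 0, biased
; exponent 0x7F = 127, fraction 0), the f32 counterpart of the half
; constant 0x3C00 above.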
+
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_1_param_0];
+; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00;
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd_imm_1(half %a) #0 {
+ %r = fadd half %a, 1.0
+ ret half %r
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fsub(half %a, half %b) #0 {
+ %r = fsub half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fneg(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fneg_param_0];
+; CHECK-F16-NEXT: mov.b16 [[Z:%h[0-9]+]], 0x0000
+; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[Z]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
+; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[Z]], [[A32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fneg(half %a) #0 {
+ %r = fsub half 0.0, %a
+ ret half %r
+}
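; This LLVM version has no dedicated fneg instruction, so negation is
; written as fsub 0.0, %a. It is lowered as an actual subtraction from a
; zero register rather than a sign-bit flip, which matches fsub semantics
; for signed zeros (0.0 - 0.0 is +0.0, not -0.0).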
+
+; CHECK-LABEL: test_fmul(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NEXT: mul.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: mul.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fmul(half %a, half %b) #0 {
+ %r = fmul half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG: cvt.f32.f16 [[F0:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[F1:%f[0-9]+]], [[B]];
+; CHECK-NEXT: div.rn.f32 [[FR:%f[0-9]+]], [[F0]], [[F1]];
+; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[FR]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fdiv(half %a, half %b) #0 {
+ %r = fdiv half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_frem(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_frem_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_frem_param_1];
+; CHECK-DAG: cvt.f32.f16 [[FA:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[FB:%f[0-9]+]], [[B]];
+; CHECK-NEXT: div.rn.f32 [[D:%f[0-9]+]], [[FA]], [[FB]];
+; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]];
+; CHECK-NEXT: mul.f32 [[RI:%f[0-9]+]], [[DI]], [[FB]];
+; CHECK-NEXT: sub.f32 [[RF:%f[0-9]+]], [[FA]], [[RI]];
+; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_frem(half %a, half %b) #0 {
+ %r = frem half %a, %b
+ ret half %r
+}
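; frem has no direct PTX instruction; the checks above verify the expansion
; r = a - floor(a / b) * b, where cvt.rmi.f32.f32 rounds toward minus
; infinity. Worked example: a = 5.5, b = 2.0 gives 5.5/2.0 = 2.75,
; floor -> 2.0, 2.0 * 2.0 = 4.0, and 5.5 - 4.0 = 1.5, so frem = 1.5.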
+
+; CHECK-LABEL: test_store(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_store_param_0];
+; CHECK-DAG: ld.param.u64 %[[PTR:rd[0-9]+]], [test_store_param_1];
+; CHECK-NEXT: st.b16 [%[[PTR]]], [[A]];
+; CHECK-NEXT: ret;
+define void @test_store(half %a, half* %b) #0 {
+ store half %a, half* %b
+ ret void
+}
+
+; CHECK-LABEL: test_load(
+; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0];
+; CHECK-NEXT: ld.b16 [[R:%h[0-9]+]], [%[[PTR]]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_load(half* %a) #0 {
+ %r = load half, half* %a
+ ret half %r
+}
+
+; CHECK-LABEL: .visible .func test_halfp0a1(
+; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0];
+; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1];
+; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.u8 [%[[TO]]], [[B0]]
+; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]]
+; CHECK: ret
+define void @test_halfp0a1(half * noalias readonly %from, half * %to) {
+ %1 = load half, half * %from , align 1
+ store half %1, half * %to , align 1
+ ret void
+}
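; With only align 1 known, the 2-byte half cannot be assumed naturally
; aligned, so the copy is split into two independent byte accesses, one
; ld.u8/st.u8 pair per byte of the value, as checked above.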
+
+declare half @test_callee(half %a, half %b) #0
+
+; CHECK-LABEL: test_call(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[A]];
+; CHECK-DAG: st.param.b16 [param1+0], [[B]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_call(half %a, half %b) #0 {
+ %r = call half @test_callee(half %a, half %b)
+ ret half %r
+}
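; Although the value is 16 bits wide, the call sequence allocates a .b32
; parameter slot (.param .b32 param0) and stores only the low 16 bits with
; st.param.b16; the return value comes back the same way through retval0
; inside the braced call.uni block delimited above.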
+
+; CHECK-LABEL: test_call_flipped(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_flipped_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[B]];
+; CHECK-DAG: st.param.b16 [param1+0], [[A]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_call_flipped(half %a, half %b) #0 {
+ %r = call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_tailcall_flipped_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_tailcall_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[B]];
+; CHECK-DAG: st.param.b16 [param1+0], [[A]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_tailcall_flipped(half %a, half %b) #0 {
+ %r = tail call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_select(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1];
+; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
+; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
+ %r = select i1 %c, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_select_cc(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_param_2];
+; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_param_3];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
+; CHECK: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
+ %cc = fcmp une half %c, %d
+ %r = select i1 %cc, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_select_cc_f32_f16(
+; CHECK-DAG: ld.param.f32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0];
+; CHECK-DAG: ld.param.f32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_f32_f16_param_2];
+; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_f32_f16_param_3];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
+; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
+ %cc = fcmp une half %c, %d
+ %r = select i1 %cc, float %a, float %b
+ ret float %r
+}
+
+; CHECK-LABEL: test_select_cc_f16_f32(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_f16_f32_param_0];
+; CHECK-DAG: ld.param.f32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2];
+; CHECK-DAG: ld.param.f32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3];
+; CHECK-DAG: setp.neu.f32 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_f16_f32_param_1];
+; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 {
+ %cc = fcmp une float %c, %d
+ %r = select i1 %cc, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fcmp_une(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_une_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_une_param_1];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_une(half %a, half %b) #0 {
+ %r = fcmp une half %a, %b
+ ret i1 %r
+}
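; The fcmp tests below exercise the predicate mapping visible in the
; checks: unordered predicates map to the *u/nan forms (une->setp.neu,
; ueq->setp.equ, ugt->setp.gtu, uge->setp.geu, ult->setp.ltu,
; ule->setp.leu, uno->setp.nan) and ordered ones to the plain forms
; (one->setp.ne, oeq->setp.eq, ogt->setp.gt, oge->setp.ge, olt->setp.lt,
; ole->setp.le, ord->setp.num), with the same suffix in .f16 or .f32
; depending on whether native f16 compares are available.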
+
+; CHECK-LABEL: test_fcmp_ueq(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ueq_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ueq_param_1];
+; CHECK-F16: setp.equ.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.equ.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ueq(half %a, half %b) #0 {
+ %r = fcmp ueq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ugt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ugt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ugt_param_1];
+; CHECK-F16: setp.gtu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.gtu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ugt(half %a, half %b) #0 {
+ %r = fcmp ugt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_uge(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uge_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uge_param_1];
+; CHECK-F16: setp.geu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.geu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_uge(half %a, half %b) #0 {
+ %r = fcmp uge half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ult(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ult_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ult_param_1];
+; CHECK-F16: setp.ltu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ltu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ult(half %a, half %b) #0 {
+ %r = fcmp ult half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ule(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ule_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ule_param_1];
+; CHECK-F16: setp.leu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.leu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ule(half %a, half %b) #0 {
+ %r = fcmp ule half %a, %b
+ ret i1 %r
+}
+
+
+; CHECK-LABEL: test_fcmp_uno(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uno_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uno_param_1];
+; CHECK-F16: setp.nan.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.nan.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_uno(half %a, half %b) #0 {
+ %r = fcmp uno half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_one(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_one_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_one_param_1];
+; CHECK-F16: setp.ne.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ne.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_one(half %a, half %b) #0 {
+ %r = fcmp one half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oeq(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oeq_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oeq_param_1];
+; CHECK-F16: setp.eq.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.eq.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_oeq(half %a, half %b) #0 {
+ %r = fcmp oeq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-F16: setp.gt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.gt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ogt(half %a, half %b) #0 {
+ %r = fcmp ogt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-F16: setp.ge.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ge.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_oge(half %a, half %b) #0 {
+ %r = fcmp oge half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_olt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_olt(half %a, half %b) #0 {
+ %r = fcmp olt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ole(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ole_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ole_param_1];
+; CHECK-F16: setp.le.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.le.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ole(half %a, half %b) #0 {
+ %r = fcmp ole half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ord(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ord_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ord_param_1];
+; CHECK-F16: setp.num.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.num.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ord(half %a, half %b) #0 {
+ %r = fcmp ord half %a, %b
+ ret i1 %r
+}
+
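+; A hedged summary (not part of the checked output) of the PTX predicate
+; names the fcmp tests above rely on: setp.eq/.ne/.lt/.le/.gt/.ge are the
+; ordered comparisons (false if either input is NaN), their
+; .equ/.neu/.ltu/.leu/.gtu/.geu counterparts also succeed on NaN, and
+; setp.num / setp.nan test whether both inputs are numeric / either is NaN.
+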
+; CHECK-LABEL: test_br_cc(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_br_cc_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_br_cc_param_1];
+; CHECK-DAG: ld.param.u64 %[[C:rd[0-9]+]], [test_br_cc_param_2];
+; CHECK-DAG: ld.param.u64 %[[D:rd[0-9]+]], [test_br_cc_param_3];
+; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: @[[PRED]] bra [[LABEL:LBB.*]];
+; CHECK: st.u32 [%[[C]]],
+; CHECK: [[LABEL]]:
+; CHECK: st.u32 [%[[D]]],
+; CHECK: ret;
+define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 {
+ %c = fcmp uge half %a, %b
+ br i1 %c, label %then, label %else
+then:
+ store i32 0, i32* %p1
+ ret void
+else:
+ store i32 0, i32* %p2
+ ret void
+}
+
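+; In test_br_cc below, the 'fcmp uge' branch is expected to be lowered as
+; the inverted, ordered compare (setp.lt) that jumps over the then-block:
+; the store through %p1 is the fall-through path and the store through
+; %p2 sits behind the branch target.
+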
+; CHECK-LABEL: test_phi(
+; CHECK: ld.param.u64 %[[P1:rd[0-9]+]], [test_phi_param_0];
+; CHECK: ld.b16 {{%h[0-9]+}}, [%[[P1]]];
+; CHECK: [[LOOP:LBB[0-9_]+]]:
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[AB:%h[0-9]+]];
+; CHECK: ld.b16 [[AB:%h[0-9]+]], [%[[P1]]];
+; CHECK: {
+; CHECK: st.param.b64 [param0+0], %[[P1]];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_dummy
+; CHECK: }
+; CHECK: setp.eq.b32 [[PRED:%p[0-9]+]], %r{{[0-9]+}}, 1;
+; CHECK: @[[PRED]] bra [[LOOP]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_phi(half* %p1) #0 {
+entry:
+ %a = load half, half* %p1
+ br label %loop
+loop:
+ %r = phi half [%a, %entry], [%b, %loop]
+ %b = load half, half* %p1
+ %c = call i1 @test_dummy(half* %p1)
+ br i1 %c, label %loop, label %return
+return:
+ ret half %r
+}
+declare i1 @test_dummy(half* %p1) #0
+
+; CHECK-LABEL: test_fptosi_i32(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i32_param_0];
+; CHECK: cvt.rzi.s32.f16 [[R:%r[0-9]+]], [[A]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i32 @test_fptosi_i32(half %a) #0 {
+ %r = fptosi half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptosi_i64(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i64_param_0];
+; CHECK: cvt.rzi.s64.f16 [[R:%rd[0-9]+]], [[A]];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i64 @test_fptosi_i64(half %a) #0 {
+ %r = fptosi half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_fptoui_i32(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i32_param_0];
+; CHECK: cvt.rzi.u32.f16 [[R:%r[0-9]+]], [[A]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i32 @test_fptoui_i32(half %a) #0 {
+ %r = fptoui half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptoui_i64(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i64_param_0];
+; CHECK: cvt.rzi.u64.f16 [[R:%rd[0-9]+]], [[A]];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i64 @test_fptoui_i64(half %a) #0 {
+ %r = fptoui half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_uitofp_i32(
+; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
+; CHECK: cvt.rn.f16.u32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i32(i32 %a) #0 {
+ %r = uitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_uitofp_i64(
+; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
+; CHECK: cvt.rn.f16.u64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i64(i64 %a) #0 {
+ %r = uitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32(
+; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
+; CHECK: cvt.rn.f16.s32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i32(i32 %a) #0 {
+ %r = sitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i64(
+; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
+; CHECK: cvt.rn.f16.s64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i64(i64 %a) #0 {
+ %r = sitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_uitofp_i32_fadd(
+; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
+; CHECK-DAG: cvt.rn.f16.u32 [[C:%h[0-9]+]], [[A]];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_uitofp_i32_fadd_param_1];
+; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = uitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32_fadd(
+; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
+; CHECK-DAG: cvt.rn.f16.s32 [[C:%h[0-9]+]], [[A]];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_sitofp_i32_fadd_param_1];
+; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
+; XCHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; XCHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; XCHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
+; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = sitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_float(
+; CHECK: ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fptrunc_float(float %a) #0 {
+ %r = fptrunc float %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_double(
+; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
+; CHECK: cvt.rn.f16.f64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fptrunc_double(double %a) #0 {
+ %r = fptrunc double %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fpext_float(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_float_param_0];
+; CHECK: cvt.f32.f16 [[R:%f[0-9]+]], [[A]];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define float @test_fpext_float(half %a) #0 {
+ %r = fpext half %a to float
+ ret float %r
+}
+
+; CHECK-LABEL: test_fpext_double(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_double_param_0];
+; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]];
+; CHECK: st.param.f64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define double @test_fpext_double(half %a) #0 {
+ %r = fpext half %a to double
+ ret double %r
+}
+
+
+; CHECK-LABEL: test_bitcast_halftoi16(
+; CHECK: ld.param.b16 [[AH:%h[0-9]+]], [test_bitcast_halftoi16_param_0];
+; CHECK: mov.b16 [[AS:%rs[0-9]+]], [[AH]]
+; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[AS]]
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i16 @test_bitcast_halftoi16(half %a) #0 {
+ %r = bitcast half %a to i16
+ ret i16 %r
+}
+
+; CHECK-LABEL: test_bitcast_i16tohalf(
+; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
+; CHECK: mov.b16 [[AH:%h[0-9]+]], [[AS]]
+; CHECK: st.param.b16 [func_retval0+0], [[AH]];
+; CHECK: ret;
+define half @test_bitcast_i16tohalf(i16 %a) #0 {
+ %r = bitcast i16 %a to half
+ ret half %r
+}
+
+
+declare half @llvm.sqrt.f16(half %a) #0
+declare half @llvm.powi.f16(half %a, i32 %b) #0
+declare half @llvm.sin.f16(half %a) #0
+declare half @llvm.cos.f16(half %a) #0
+declare half @llvm.pow.f16(half %a, half %b) #0
+declare half @llvm.exp.f16(half %a) #0
+declare half @llvm.exp2.f16(half %a) #0
+declare half @llvm.log.f16(half %a) #0
+declare half @llvm.log10.f16(half %a) #0
+declare half @llvm.log2.f16(half %a) #0
+declare half @llvm.fma.f16(half %a, half %b, half %c) #0
+declare half @llvm.fabs.f16(half %a) #0
+declare half @llvm.minnum.f16(half %a, half %b) #0
+declare half @llvm.maxnum.f16(half %a, half %b) #0
+declare half @llvm.copysign.f16(half %a, half %b) #0
+declare half @llvm.floor.f16(half %a) #0
+declare half @llvm.ceil.f16(half %a) #0
+declare half @llvm.trunc.f16(half %a) #0
+declare half @llvm.rint.f16(half %a) #0
+declare half @llvm.nearbyint.f16(half %a) #0
+declare half @llvm.round.f16(half %a) #0
+declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
+
+; CHECK-LABEL: test_sqrt(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sqrt_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: sqrt.rn.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sqrt(half %a) #0 {
+ %r = call half @llvm.sqrt.f16(half %a)
+ ret half %r
+}
+
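+; test_sqrt above shows the pattern used for f16 math ops with no native
+; f16 instruction (and repeated by test_sin, test_cos, test_fabs,
+; test_minnum and test_maxnum below): widen with cvt.f32.f16, do the
+; operation in f32, and round back with cvt.rn.f16.f32.
+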
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_powi(
+;define half @test_powi(half %a, i32 %b) #0 {
+; %r = call half @llvm.powi.f16(half %a, i32 %b)
+; ret half %r
+;}
+
+; CHECK-LABEL: test_sin(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sin_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: sin.approx.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sin(half %a) #0 #1 {
+ %r = call half @llvm.sin.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_cos(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_cos_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: cos.approx.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_cos(half %a) #0 #1 {
+ %r = call half @llvm.cos.f16(half %a)
+ ret half %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_pow(
+;define half @test_pow(half %a, half %b) #0 {
+; %r = call half @llvm.pow.f16(half %a, half %b)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp(
+;define half @test_exp(half %a) #0 {
+; %r = call half @llvm.exp.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp2(
+;define half @test_exp2(half %a) #0 {
+; %r = call half @llvm.exp2.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log(
+;define half @test_log(half %a) #0 {
+; %r = call half @llvm.log.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log10(
+;define half @test_log10(half %a) #0 {
+; %r = call half @llvm.log10.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log2(
+;define half @test_log2(half %a) #0 {
+; %r = call half @llvm.log2.f16(half %a)
+; ret half %r
+;}
+
+; CHECK-LABEL: test_fma(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fma_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fma_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fma_param_2];
+; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fma(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fma.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fabs(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fabs_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: abs.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fabs(half %a) #0 {
+ %r = call half @llvm.fabs.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_minnum(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK: min.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_minnum(half %a, half %b) #0 {
+ %r = call half @llvm.minnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_maxnum(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK: max.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_maxnum(half %a, half %b) #0 {
+ %r = call half @llvm.maxnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_param_0];
+; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_param_1];
+; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
+; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign(half %a, half %b) #0 {
+ %r = call half @llvm.copysign.f16(half %a, half %b)
+ ret half %r
+}
+
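+; The copysign tests above and below check a pure bit-level expansion:
+; keep the magnitude bits of %a and take the sign bit of %b. A minimal
+; commented-out sketch of that expansion in plain IR (hypothetical
+; function name, not exercised by the RUN lines):
+; define half @copysign_sketch(half %a, half %b) {
+;   %ab = bitcast half %a to i16
+;   %bb = bitcast half %b to i16
+;   %mag = and i16 %ab, 32767    ; 0x7fff clears the sign bit of %a
+;   %sgn = and i16 %bb, -32768   ; 0x8000 keeps only the sign bit of %b
+;   %r16 = or i16 %mag, %sgn
+;   %r = bitcast i16 %r16 to half
+;   ret half %r
+; }
+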
+; CHECK-LABEL: test_copysign_f32(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f32_param_0];
+; CHECK-DAG: ld.param.f32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
+; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
+; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648;
+; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16;
+; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]];
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign_f32(half %a, float %b) #0 {
+ %tb = fptrunc float %b to half
+ %r = call half @llvm.copysign.f16(half %a, half %tb)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign_f64(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f64_param_0];
+; CHECK-DAG: ld.param.f64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
+; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
+; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
+; CHECK-DAG: shr.u64 [[BX1:%rd[0-9]+]], [[BX0]], 48;
+; CHECK-DAG: cvt.u16.u64 [[BX2:%rs[0-9]+]], [[BX1]];
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign_f64(half %a, double %b) #0 {
+ %tb = fptrunc double %b to half
+ %r = call half @llvm.copysign.f16(half %a, half %tb)
+ ret half %r
+}
+
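+; The shift amounts in the two tests above follow from the sign-bit
+; positions: an f32 sign lives in bit 31, so after masking with
+; 0x80000000 a shr by 31 - 15 = 16 moves it to bit 15, the f16 sign
+; position; an f64 sign lives in bit 63, hence the shr by 63 - 15 = 48.
+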
+; CHECK-LABEL: test_copysign_extended(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_extended_param_0];
+; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_extended_param_1];
+; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
+; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: cvt.f32.f16 [[XR:%f[0-9]+]], [[R]];
+; CHECK: st.param.f32 [func_retval0+0], [[XR]];
+; CHECK: ret;
+define float @test_copysign_extended(half %a, half %b) #0 {
+ %r = call half @llvm.copysign.f16(half %a, half %b)
+ %xr = fpext half %r to float
+ ret float %xr
+}
+
+; CHECK-LABEL: test_floor(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_floor_param_0];
+; CHECK: cvt.rmi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_floor(half %a) #0 {
+ %r = call half @llvm.floor.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_ceil(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_ceil_param_0];
+; CHECK: cvt.rpi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_ceil(half %a) #0 {
+ %r = call half @llvm.ceil.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_trunc(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_trunc_param_0];
+; CHECK: cvt.rzi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_trunc(half %a) #0 {
+ %r = call half @llvm.trunc.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_rint(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_rint_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_rint(half %a) #0 {
+ %r = call half @llvm.rint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_nearbyint(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_nearbyint_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_nearbyint(half %a) #0 {
+ %r = call half @llvm.nearbyint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_round(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_round_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_round(half %a) #0 {
+ %r = call half @llvm.round.f16(half %a)
+ ret half %r
+}
+
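+; The rounding tests above all lower to a single f16-to-f16 cvt whose
+; modifier selects the rounding mode: rmi (toward -inf) for floor, rpi
+; (toward +inf) for ceil, rzi (toward zero) for trunc, and rni (to
+; nearest even) for rint, nearbyint and round.
+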
+; CHECK-LABEL: test_fmuladd(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmuladd_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmuladd_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fmuladd_param_2];
+; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fmuladd(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "unsafe-fp-math" = "true" }
diff --git a/test/CodeGen/NVPTX/f16x2-instructions.ll b/test/CodeGen/NVPTX/f16x2-instructions.ll
index 33bb616d895c..5dc796ada37f 100644
--- a/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1,1426 +1,1427 @@
-; ## Full FP16 support enabled by default.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -disable-fp-elim \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
-; ## FP16 support explicitly disabled.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
-; ## FP16 is not supported by hardware.
-; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
-; RUN: -disable-post-ra -disable-fp-elim \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-
-; CHECK-LABEL: test_ret_const(
-; CHECK: mov.u32 [[T:%r[0-9]+]], 1073757184;
-; CHECK: mov.b32 [[R:%hh[0-9]+]], [[T]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_ret_const() #0 {
- ret <2 x half> <half 1.0, half 2.0>
-}
-
-; CHECK-LABEL: test_extract_0(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_0_param_0];
-; CHECK: mov.b32 {[[R:%h[0-9]+]], %tmp_hi}, [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_extract_0(<2 x half> %a) #0 {
- %e = extractelement <2 x half> %a, i32 0
- ret half %e
-}
-
-; CHECK-LABEL: test_extract_1(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_1_param_0];
-; CHECK: mov.b32 {%tmp_lo, [[R:%h[0-9]+]]}, [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_extract_1(<2 x half> %a) #0 {
- %e = extractelement <2 x half> %a, i32 1
- ret half %e
-}
-
-; CHECK-LABEL: test_extract_i(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_i_param_0];
-; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1];
-; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0;
-; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]];
-; CHECK: selp.b16 [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
- %e = extractelement <2 x half> %a, i64 %idx
- ret half %e
-}
-
-; CHECK-LABEL: test_fadd(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_param_1];
-;
-; CHECK-F16-NEXT: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
- %r = fadd <2 x half> %a, %b
- ret <2 x half> %r
-}
-
-; Check that we can lower fadd with immediate arguments.
-; CHECK-LABEL: test_fadd_imm_0(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0];
-;
-; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
-; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
-; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
-; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
- %r = fadd <2 x half> <half 1.0, half 2.0>, %a
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fadd_imm_1(
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0];
-;
-; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
-; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
-; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
-; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
- %r = fadd <2 x half> %a, <half 1.0, half 2.0>
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fsub(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fsub_param_0];
-;
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fsub_param_1];
-; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
- %r = fsub <2 x half> %a, %b
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fneg(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0];
-;
-; CHECK-F16: mov.u32 [[I0:%r[0-9]+]], 0;
-; CHECK-F16: mov.b32 [[IHH0:%hh[0-9]+]], [[I0]];
-; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]];
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fneg(<2 x half> %a) #0 {
- %r = fsub <2 x half> <half 0.0, half 0.0>, %a
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fmul(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmul_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmul_param_1];
-; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
- %r = fmul <2 x half> %a, %b
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fdiv(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fdiv_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fdiv_param_1];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
-; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
-; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]];
-; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
- %r = fdiv <2 x half> %a, %b
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_frem(
-; -- Load two f16x2 inputs.
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_frem_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_frem_param_1];
-; -- Split into elements
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; -- promote to f32.
-; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
-; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
-; -- frem(a[0],b[0]).
-; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-DAG: cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]];
-; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]];
-; CHECK-DAG: sub.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RI0]];
-; -- frem(a[1],b[1]).
-; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-DAG: cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]];
-; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]];
-; CHECK-DAG: sub.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RI1]];
-; -- convert back to f16.
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
-; -- merge into f16x2 and return it.
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
- %r = frem <2 x half> %a, %b
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: .func test_ldst_v2f16(
-; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0];
-; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1];
-; CHECK-DAG: ld.b32 [[E:%hh[0-9]+]], [%[[A]]]
-; CHECK: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]];
-; CHECK-DAG: st.v2.b16 [%[[B]]], {[[E0]], [[E1]]};
-; CHECK: ret;
-define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) {
- %t1 = load <2 x half>, <2 x half>* %a
- store <2 x half> %t1, <2 x half>* %b, align 16
- ret void
-}
-
-; CHECK-LABEL: .func test_ldst_v3f16(
-; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0];
-; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1];
-; -- v3 is inconvenient to capture as it's lowered as ld.b64 plus a fair
-; number of bit-shifting instructions that may change at llvm's whim.
-; So we only verify that we issue the correct number of writes with the
-; correct offsets, but not the values we write.
-; CHECK-DAG: ld.u64
-; CHECK-DAG: st.u32 [%[[B]]],
-; CHECK-DAG: st.b16 [%[[B]]+4],
-; CHECK: ret;
-define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) {
- %t1 = load <3 x half>, <3 x half>* %a
- store <3 x half> %t1, <3 x half>* %b, align 16
- ret void
-}
-
-; CHECK-LABEL: .func test_ldst_v4f16(
-; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0];
-; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1];
-; CHECK-DAG: ld.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]];
-; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: ret;
-define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) {
- %t1 = load <4 x half>, <4 x half>* %a
- store <4 x half> %t1, <4 x half>* %b, align 16
- ret void
-}
-
-; CHECK-LABEL: .func test_ldst_v8f16(
-; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0];
-; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1];
-; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]];
-; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: ret;
-define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) {
- %t1 = load <8 x half>, <8 x half>* %a
- store <8 x half> %t1, <8 x half>* %b, align 16
- ret void
-}
-
-declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0
-
-; CHECK-LABEL: test_call(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_param_1];
-; CHECK: {
-; CHECK-DAG: .param .align 4 .b8 param0[4];
-; CHECK-DAG: .param .align 4 .b8 param1[4];
-; CHECK-DAG: st.param.b32 [param0+0], [[A]];
-; CHECK-DAG: st.param.b32 [param1+0], [[B]];
-; CHECK-DAG: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_callee,
-; CHECK: );
-; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
-; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
- %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_call_flipped(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_flipped_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_flipped_param_1];
-; CHECK: {
-; CHECK-DAG: .param .align 4 .b8 param0[4];
-; CHECK-DAG: .param .align 4 .b8 param1[4];
-; CHECK-DAG: st.param.b32 [param0+0], [[B]];
-; CHECK-DAG: st.param.b32 [param1+0], [[A]];
-; CHECK-DAG: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_callee,
-; CHECK: );
-; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
-; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
- %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_tailcall_flipped(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1];
-; CHECK: {
-; CHECK-DAG: .param .align 4 .b8 param0[4];
-; CHECK-DAG: .param .align 4 .b8 param1[4];
-; CHECK-DAG: st.param.b32 [param0+0], [[B]];
-; CHECK-DAG: st.param.b32 [param1+0], [[A]];
-; CHECK-DAG: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_callee,
-; CHECK: );
-; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
-; CHECK-NEXT: }
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
- %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_select(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_param_1];
-; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2]
-; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
-; CHECK-NEXT: selp.b32 [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
- %r = select i1 %c, <2 x half> %a, <2 x half> %b
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_select_cc(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_param_1];
-; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_param_2];
-; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_param_3];
-;
-; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
-;
-; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
-; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
-; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
-;
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 {
- %cc = fcmp une <2 x half> %c, %d
- %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_select_cc_f32_f16(
-; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0];
-; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1];
-; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2];
-; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3];
-;
-; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
-; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
-; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
-; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
-;
-; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
- <2 x half> %c, <2 x half> %d) #0 {
- %cc = fcmp une <2 x half> %c, %d
- %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
- ret <2 x float> %r
-}
-
-; CHECK-LABEL: test_select_cc_f16_f32(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1];
-; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2];
-; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3];
-; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]]
-; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]]
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
-; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
- <2 x float> %c, <2 x float> %d) #0 {
- %cc = fcmp une <2 x float> %c, %d
- %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fcmp_une(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_une_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_une_param_1];
-; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp une <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_ueq(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1];
-; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp ueq <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_ugt(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ugt_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1];
-; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp ugt <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_uge(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uge_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uge_param_1];
-; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp uge <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_ult(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ult_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ult_param_1];
-; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp ult <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_ule(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ule_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ule_param_1];
-; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp ule <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-
-; CHECK-LABEL: test_fcmp_uno(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uno_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uno_param_1];
-; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp uno <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_one(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_one_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_one_param_1];
-; CHECK-F16: setp.ne.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp one <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_oeq(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1];
-; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp oeq <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_ogt(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1];
-; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp ogt <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_oge(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oge_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oge_param_1];
-; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp oge <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_olt(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_olt_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_olt_param_1];
-; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp olt <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_ole(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ole_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ole_param_1];
-; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp ole <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fcmp_ord(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ord_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ord_param_1];
-; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
-; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
-; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
-; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
-; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-NEXT: ret;
-define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 {
- %r = fcmp ord <2 x half> %a, %b
- ret <2 x i1> %r
-}
-
-; CHECK-LABEL: test_fptosi_i32(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i32_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]];
-; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
-; CHECK: ret;
-define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 {
- %r = fptosi <2 x half> %a to <2 x i32>
- ret <2 x i32> %r
-}
-
-; CHECK-LABEL: test_fptosi_i64(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i64_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]];
-; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
-; CHECK: ret;
-define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 {
- %r = fptosi <2 x half> %a to <2 x i64>
- ret <2 x i64> %r
-}
-
-; CHECK-LABEL: test_fptoui_2xi32(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]];
-; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
-; CHECK: ret;
-define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 {
- %r = fptoui <2 x half> %a to <2 x i32>
- ret <2 x i32> %r
-}
-
-; CHECK-LABEL: test_fptoui_2xi64(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]];
-; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
-; CHECK: ret;
-define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 {
- %r = fptoui <2 x half> %a to <2 x i64>
- ret <2 x i64> %r
-}
-
-; CHECK-LABEL: test_uitofp_2xi32(
-; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0];
-; CHECK-DAG: cvt.rn.f16.u32 [[R0:%h[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rn.f16.u32 [[R1:%h[0-9]+]], [[A1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
- %r = uitofp <2 x i32> %a to <2 x half>
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_uitofp_2xi64(
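-; The i64 elements are converted to f16 in two steps, through f32.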
-; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0];
-; CHECK-DAG: cvt.rn.f32.u64 [[F0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rn.f32.u64 [[F1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
- %r = uitofp <2 x i64> %a to <2 x half>
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_sitofp_2xi32(
-; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0];
-; CHECK-DAG: cvt.rn.f16.s32 [[R0:%h[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rn.f16.s32 [[R1:%h[0-9]+]], [[A1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
- %r = sitofp <2 x i32> %a to <2 x half>
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_sitofp_2xi64(
-; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0];
-; CHECK-DAG: cvt.rn.f32.s64 [[F0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rn.f32.s64 [[F1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
- %r = sitofp <2 x i64> %a to <2 x half>
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_uitofp_2xi32_fadd(
-; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1];
-; CHECK-DAG: cvt.rn.f16.u32 [[C0:%h[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rn.f16.u32 [[C1:%h[0-9]+]], [[A1]];
-;
-; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
-; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
-; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
- %c = uitofp <2 x i32> %a to <2 x half>
- %r = fadd <2 x half> %b, %c
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_sitofp_2xi32_fadd(
-; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1];
-; CHECK-DAG: cvt.rn.f16.s32 [[C0:%h[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rn.f16.s32 [[C1:%h[0-9]+]], [[A1]];
-;
-; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
-; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
-; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
- %c = sitofp <2 x i32> %a to <2 x half>
- %r = fadd <2 x half> %b, %c
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fptrunc_2xfloat(
-; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[A1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
- %r = fptrunc <2 x float> %a to <2 x half>
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fptrunc_2xdouble(
-; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0];
-; CHECK-DAG: cvt.rn.f16.f64 [[R0:%h[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.rn.f16.f64 [[R1:%h[0-9]+]], [[A1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
- %r = fptrunc <2 x double> %a to <2 x half>
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fpext_2xfloat(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]];
-; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK: ret;
-define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
- %r = fpext <2 x half> %a to <2 x float>
- ret <2 x float> %r
-}
-
-; CHECK-LABEL: test_fpext_2xdouble(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]];
-; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK: ret;
-define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
- %r = fpext <2 x half> %a to <2 x double>
- ret <2 x double> %r
-}
-
-
-; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16(
-; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0];
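-; Element 0 is the low 16 bits of the packed b32; element 1 is recovered by shifting right 16.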
-; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]]
-; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16
-; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]]
-; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}
-; CHECK: ret;
-define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
- %r = bitcast <2 x half> %a to <2 x i16>
- ret <2 x i16> %r
-}
-
-; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf(
-; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0];
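-; The reverse direction shifts element 1 into the high half and ORs in element 0.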
-; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]];
-; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]];
-; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16;
-; CHECK-DAG: or.b32 [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], [[R1H0L]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
- %r = bitcast <2 x i16> %a to <2 x half>
- ret <2 x half> %r
-}
-
-
-declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0
-declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0
-declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.log.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
-declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0
-declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0
-declare <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) #0
-declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.round.f16(<2 x half> %a) #0
-declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
-
-; CHECK-LABEL: test_sqrt(
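-; Both RUN variants expect the same expansion: promote to f32, sqrt.rn.f32, round back to f16.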
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sqrt_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_sqrt(<2 x half> %a) #0 {
- %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a)
- ret <2 x half> %r
-}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_powi(
-;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 {
-; %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b)
-; ret <2 x half> %r
-;}
-
-; CHECK-LABEL: test_sin(
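-; sin.approx.f32 is an approximate instruction; its use here is enabled by the "unsafe-fp-math" attribute (#1 below).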
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sin_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
- %r = call <2 x half> @llvm.sin.f16(<2 x half> %a)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_cos(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_cos_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
- %r = call <2 x half> @llvm.cos.f16(<2 x half> %a)
- ret <2 x half> %r
-}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_pow(
-;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 {
-; %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b)
-; ret <2 x half> %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_exp(
-;define <2 x half> @test_exp(<2 x half> %a) #0 {
-; %r = call <2 x half> @llvm.exp.f16(<2 x half> %a)
-; ret <2 x half> %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_exp2(
-;define <2 x half> @test_exp2(<2 x half> %a) #0 {
-; %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a)
-; ret <2 x half> %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log(
-;define <2 x half> @test_log(<2 x half> %a) #0 {
-; %r = call <2 x half> @llvm.log.f16(<2 x half> %a)
-; ret <2 x half> %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log10(
-;define <2 x half> @test_log10(<2 x half> %a) #0 {
-; %r = call <2 x half> @llvm.log10.f16(<2 x half> %a)
-; ret <2 x half> %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log2(
-;define <2 x half> @test_log2(<2 x half> %a) #0 {
-; %r = call <2 x half> @llvm.log2.f16(<2 x half> %a)
-; ret <2 x half> %r
-;}
-
-; CHECK-LABEL: test_fma(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fma_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fma_param_1];
-; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fma_param_2];
-;
-; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
- %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fabs_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_fabs(<2 x half> %a) #0 {
- %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_minnum(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_minnum_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_minnum_param_1];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
-; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
-; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
-; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
- %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_maxnum(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_maxnum_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_maxnum_param_1];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
-; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
-; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
-; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 {
- %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_copysign(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_param_1];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
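-; copysign(a, b) keeps a's magnitude and takes b's sign bit: (a & 0x7fff) | (b & 0x8000).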
-; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]];
-; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]];
-; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767;
-; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767;
-; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768;
-; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768;
-; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
-; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
-; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
-; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
-; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 {
- %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_copysign_f32(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f32_param_0];
-; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]];
-; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]];
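-; The f32 sign mask is 0x80000000 (-2147483648); shifting right by 16 aligns it with the f16 sign bit.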
-; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767;
-; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767;
-; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648;
-; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648;
-; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16;
-; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16;
-; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]];
-; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]];
-; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
-; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
-; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
-; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
-; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
- %tb = fptrunc <2 x float> %b to <2 x half>
- %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_copysign_f64(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f64_param_0];
-; CHECK-DAG: ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]];
-; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]];
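-; The f64 sign mask is 0x8000000000000000; shifting right by 48 aligns it with the f16 sign bit.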
-; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767;
-; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767;
-; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808;
-; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808;
-; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48;
-; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48;
-; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]];
-; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]];
-; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
-; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
-; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
-; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
-; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
- %tb = fptrunc <2 x double> %b to <2 x half>
- %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_copysign_extended(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_extended_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_extended_param_1];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
-; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
-; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]];
-; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]];
-; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767;
-; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767;
-; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768;
-; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768;
-; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
-; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
-; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
-; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
-; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: mov.b32 {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]]
-; CHECK-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[RX0]];
-; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[RX1]];
-; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]};
-; CHECK: ret;
-define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
- %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
- %xr = fpext <2 x half> %r to <2 x float>
- ret <2 x float> %xr
-}
-
-; CHECK-LABEL: test_floor(
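-; floor, ceil, trunc, and the rounding intrinsics below map to single cvt.f16.f16 instructions
-; that differ only in rounding mode (rmi, rpi, rzi, rni).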
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_floor_param_0];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_floor(<2 x half> %a) #0 {
- %r = call <2 x half> @llvm.floor.f16(<2 x half> %a)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_ceil(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_ceil_param_0];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_ceil(<2 x half> %a) #0 {
- %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_trunc(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_trunc_param_0];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_trunc(<2 x half> %a) #0 {
- %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_rint(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_rint_param_0];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_rint(<2 x half> %a) #0 {
- %r = call <2 x half> @llvm.rint.f16(<2 x half> %a)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_nearbyint(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_nearbyint_param_0];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_nearbyint(<2 x half> %a) #0 {
- %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_round(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
-; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_round(<2 x half> %a) #0 {
- %r = call <2 x half> @llvm.round.f16(<2 x half> %a)
- ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fmuladd(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmuladd_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmuladd_param_1];
-; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fmuladd_param_2];
-;
-; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
- %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
- ret <2 x half> %r
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { "unsafe-fp-math" = "true" }
+; ## Full FP16 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN: -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_ret_const(
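+; 1073757184 is 0x40003C00: halves 1.0 (0x3C00, low) and 2.0 (0x4000, high) packed into one b32.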
+; CHECK: mov.u32 [[T:%r[0-9]+]], 1073757184;
+; CHECK: mov.b32 [[R:%hh[0-9]+]], [[T]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_ret_const() #0 {
+ ret <2 x half> <half 1.0, half 2.0>
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_0_param_0];
+; CHECK: mov.b32 {[[R:%h[0-9]+]], %tmp_hi}, [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_0(<2 x half> %a) #0 {
+ %e = extractelement <2 x half> %a, i32 0
+ ret half %e
+}
+
+; CHECK-LABEL: test_extract_1(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_1_param_0];
+; CHECK: mov.b32 {%tmp_lo, [[R:%h[0-9]+]]}, [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_1(<2 x half> %a) #0 {
+ %e = extractelement <2 x half> %a, i32 1
+ ret half %e
+}
+
+; CHECK-LABEL: test_extract_i(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_i_param_0];
+; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1];
+; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0;
+; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]];
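+; A dynamic extract becomes a compare-and-select: element 0 when idx == 0, element 1 otherwise.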
+; CHECK: selp.b16 [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
+ %e = extractelement <2 x half> %a, i64 %idx
+ ret half %e
+}
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_param_1];
+;
+; CHECK-F16-NEXT: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fadd <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; Check that we can lower fadd with immediate arguments.
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0];
+;
+; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
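+; 0f3F800000 and 0f40000000 are the f32 hex encodings of 1.0 and 2.0.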
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
+ %r = fadd <2 x half> <half 1.0, half 2.0>, %a
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0];
+;
+; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
+ %r = fadd <2 x half> %a, <half 1.0, half 2.0>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fsub_param_0];
+;
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fsub <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fneg(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0];
+;
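+; fneg is emitted as a subtraction from a packed pair of zeros.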
+; CHECK-F16: mov.u32 [[I0:%r[0-9]+]], 0;
+; CHECK-F16: mov.b32 [[IHH0:%hh[0-9]+]], [[I0]];
+; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]];
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fneg(<2 x half> %a) #0 {
+ %r = fsub <2 x half> <half 0.0, half 0.0>, %a
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fmul(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fmul <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fdiv <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_frem(
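+; frem is expanded elementwise in f32 as a - floor(a/b) * b; cvt.rmi rounds toward minus infinity.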
+; -- Load the two f16x2 inputs.
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_frem_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_frem_param_1];
+; -- Split into elements
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; -- promote to f32.
+; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
+; -- frem(a[0],b[0]).
+; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG: cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]];
+; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]];
+; CHECK-DAG: sub.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RI0]];
+; -- frem(a[1],b[1]).
+; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG: cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]];
+; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]];
+; CHECK-DAG: sub.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RI1]];
+; -- convert back to f16.
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; -- merge into f16x2 and return it.
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
+ %r = frem <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: .func test_ldst_v2f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1];
+; CHECK-DAG: ld.b32 [[E:%hh[0-9]+]], [%[[A]]]
+; CHECK: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]];
+; CHECK-DAG: st.v2.b16 [%[[B]]], {[[E0]], [[E1]]};
+; CHECK: ret;
+define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) {
+ %t1 = load <2 x half>, <2 x half>* %a
+ store <2 x half> %t1, <2 x half>* %b, align 16
+ ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v3f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1];
+; -- v3 is inconvenient to capture, as it's lowered to an ld.b64 plus a fair
+; number of bit-shifting instructions that may change at LLVM's whim.
+; So we verify only that we issue the correct number of stores at the
+; correct offsets, not the values we store.
+; CHECK-DAG: ld.u64
+; CHECK-DAG: st.u32 [%[[B]]],
+; CHECK-DAG: st.b16 [%[[B]]+4],
+; CHECK: ret;
+define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) {
+ %t1 = load <3 x half>, <3 x half>* %a
+ store <3 x half> %t1, <3 x half>* %b, align 16
+ ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v4f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1];
+; CHECK-DAG: ld.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]];
+; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: ret;
+define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) {
+ %t1 = load <4 x half>, <4 x half>* %a
+ store <4 x half> %t1, <4 x half>* %b, align 16
+ ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v8f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1];
+; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]];
+; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: ret;
+define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) {
+ %t1 = load <8 x half>, <8 x half>* %a
+ store <8 x half> %t1, <8 x half>* %b, align 16
+ ret void
+}
+
+declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0
+
+; CHECK-LABEL: test_call(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_param_1];
+; CHECK: {
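+; A <2 x half> argument travels as a single 4-byte .param buffer with 4-byte alignment.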
+; CHECK-DAG: .param .align 4 .b8 param0[4];
+; CHECK-DAG: .param .align 4 .b8 param1[4];
+; CHECK-DAG: st.param.b32 [param0+0], [[A]];
+; CHECK-DAG: st.param.b32 [param1+0], [[B]];
+; CHECK-DAG: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_call_flipped(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_flipped_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .align 4 .b8 param0[4];
+; CHECK-DAG: .param .align 4 .b8 param1[4];
+; CHECK-DAG: st.param.b32 [param0+0], [[B]];
+; CHECK-DAG: st.param.b32 [param1+0], [[A]];
+; CHECK-DAG: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .align 4 .b8 param0[4];
+; CHECK-DAG: .param .align 4 .b8 param1[4];
+; CHECK-DAG: st.param.b32 [param0+0], [[B]];
+; CHECK-DAG: st.param.b32 [param1+0], [[A]];
+; CHECK-DAG: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
+ %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_param_1];
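+; The i1 %c arrives as a u8 parameter and is compared against 1 to form the select predicate.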
+; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2]
+; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
+; CHECK-NEXT: selp.b32 [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
+ %r = select i1 %c, <2 x half> %a, <2 x half> %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select_cc(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_param_2];
+; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_param_3];
+;
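+; The f16x2 compare writes two predicates at once, one per lane.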
+; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
+;
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
+; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
+;
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 {
+ %cc = fcmp une <2 x half> %c, %d
+ %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select_cc_f32_f16(
+; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0];
+; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2];
+; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3];
+;
+; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
+; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
+;
+; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
+ <2 x half> %c, <2 x half> %d) #0 {
+ %cc = fcmp une <2 x half> %c, %d
+ %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
+ ret <2 x float> %r
+}
+
+; CHECK-LABEL: test_select_cc_f16_f32(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1];
+; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2];
+; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3];
+; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]]
+; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]]
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
+ <2 x float> %c, <2 x float> %d) #0 {
+ %cc = fcmp une <2 x float> %c, %d
+ %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fcmp_une(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_une_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_une_param_1];
+; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp une <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ueq(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1];
+; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ueq <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ugt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ugt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1];
+; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ugt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_uge(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uge_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uge_param_1];
+; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp uge <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ult(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ult_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ult_param_1];
+; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ult <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ule(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ule_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ule_param_1];
+; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ule <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+
+; CHECK-LABEL: test_fcmp_uno(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uno_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uno_param_1];
+; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp uno <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_one(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_one_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_one_param_1];
+; CHECK-F16: setp.ne.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp one <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_oeq(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1];
+; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp oeq <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ogt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp oge <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_olt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp olt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ole(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ole_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ole_param_1];
+; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ole <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ord(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ord_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ord_param_1];
+; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ord <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fptosi_i32(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i32_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 {
+ %r = fptosi <2 x half> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+; CHECK-LABEL: test_fptosi_i64(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i64_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 {
+ %r = fptosi <2 x half> %a to <2 x i64>
+ ret <2 x i64> %r
+}
+
+; CHECK-LABEL: test_fptoui_2xi32(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 {
+ %r = fptoui <2 x half> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+; CHECK-LABEL: test_fptoui_2xi64(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]];
+; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 {
+ %r = fptoui <2 x half> %a to <2 x i64>
+ ret <2 x i64> %r
+}
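+
+; The four fp-to-int tests above all expect the .rzi conversion modifier
+; (round toward zero to integer), which matches the truncating semantics of
+; fptosi/fptoui. A scalar sketch (illustrative only):
+;   %i = fptosi half %x to i32   ; expected: cvt.rzi.s32.f16
+;   %u = fptoui half %x to i32   ; expected: cvt.rzi.u32.f16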
+
+; CHECK-LABEL: test_uitofp_2xi32(
+; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0];
+; CHECK-DAG: cvt.rn.f16.u32 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.u32 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
+ %r = uitofp <2 x i32> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_uitofp_2xi64(
+; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0];
+; CHECK-DAG: cvt.rn.f32.u64 [[F0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f32.u64 [[F1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
+ %r = uitofp <2 x i64> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi32(
+; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0];
+; CHECK-DAG: cvt.rn.f16.s32 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.s32 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
+ %r = sitofp <2 x i32> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi64(
+; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0];
+; CHECK-DAG: cvt.rn.f32.s64 [[F0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f32.s64 [[F1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
+ %r = sitofp <2 x i64> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_uitofp_2xi32_fadd(
+; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1];
+; CHECK-DAG: cvt.rn.f16.u32 [[C0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.u32 [[C1:%h[0-9]+]], [[A1]];
+
+; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
+; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
+ %c = uitofp <2 x i32> %a to <2 x half>
+ %r = fadd <2 x half> %b, %c
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi32_fadd(
+; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1];
+; CHECK-DAG: cvt.rn.f16.s32 [[C0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.s32 [[C1:%h[0-9]+]], [[A1]];
+;
+; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
+; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
+ %c = sitofp <2 x i32> %a to <2 x half>
+ %r = fadd <2 x half> %b, %c
+ ret <2 x half> %r
+}
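+
+; The CHECK-F16/CHECK-NOF16 split in the two tests above comes from the two
+; RUN configurations at the top of this file (outside this hunk): one with
+; native f16 arithmetic, where the fadd may stay as add.rn.f16x2, and one
+; with f16 math disabled, where it must round-trip through f32. Assuming
+; commands of roughly this shape (a sketch, not the verbatim RUN lines):
+;   llc < %s -march=nvptx64 -mcpu=sm_53 | FileCheck -check-prefixes=CHECK,CHECK-F16 %s
+;   llc < %s -march=nvptx64 -mcpu=sm_53 -nvptx-no-f16-math | FileCheck -check-prefixes=CHECK,CHECK-NOF16 %s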
+
+; CHECK-LABEL: test_fptrunc_2xfloat(
+; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
+ %r = fptrunc <2 x float> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fptrunc_2xdouble(
+; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0];
+; CHECK-DAG: cvt.rn.f16.f64 [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.rn.f16.f64 [[R1:%h[0-9]+]], [[A1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
+ %r = fptrunc <2 x double> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fpext_2xfloat(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]];
+; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK: ret;
+define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
+ %r = fpext <2 x half> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+; CHECK-LABEL: test_fpext_2xdouble(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]];
+; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK: ret;
+define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
+ %r = fpext <2 x half> %a to <2 x double>
+ ret <2 x double> %r
+}
+
+
+; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16(
+; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0];
+; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]]
+; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16
+; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]]
+; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK: ret;
+define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
+ %r = bitcast <2 x half> %a to <2 x i16>
+ ret <2 x i16> %r
+}
+
+; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf(
+; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0];
+; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]];
+; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]];
+; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16;
+; CHECK-DAG: or.b32 [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], [[R1H0L]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
+ %r = bitcast <2 x i16> %a to <2 x half>
+ ret <2 x half> %r
+}
+
+
+declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0
+declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
+declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.round.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
+
+; CHECK-LABEL: test_sqrt(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sqrt_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sqrt(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
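+
+; PTX has no f16 sqrt, so the checks above expect the widen/operate/narrow
+; pattern: cvt.f32.f16, sqrt.rn.f32, cvt.rn.f16.f32, then repack. The same
+; shape recurs in the libm-style tests below. A scalar sketch of the
+; equivalent IR (illustrative only):
+;   %f = fpext half %x to float
+;   %s = call float @llvm.sqrt.f32(float %f)
+;   %r = fptrunc float %s to half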
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_powi(
+;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 {
+; %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b)
+; ret <2 x half> %r
+;}
+
+; CHECK-LABEL: test_sin(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sin_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
+ %r = call <2 x half> @llvm.sin.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_cos(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_cos_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
+ %r = call <2 x half> @llvm.cos.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_pow(
+;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 {
+; %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp(
+;define <2 x half> @test_exp(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.exp.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp2(
+;define <2 x half> @test_exp2(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log(
+;define <2 x half> @test_log(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.log.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log10(
+;define <2 x half> @test_log10(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.log10.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log2(
+;define <2 x half> @test_log2(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.log2.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+; CHECK-LABEL: test_fma(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fma_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fma_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fma_param_2];
+;
+; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret
+define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+ %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fabs(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fabs_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fabs(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_minnum(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
+; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_maxnum(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
+; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768;
+; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768;
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
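+
+; The expansion above is plain bit arithmetic on the IEEE half layout:
+; 32767 is 0x7fff (all bits except the sign) and -32768 is the 16-bit
+; pattern 0x8000 (just the sign bit), so per lane the expected result is
+;   copysign(a, b) = (bits(a) & 0x7fff) | (bits(b) & 0x8000)
+; computed in 16-bit integer registers.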
+
+; CHECK-LABEL: test_copysign_f32(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f32_param_0];
+; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648;
+; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648;
+; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16;
+; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16;
+; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]];
+; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]];
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
+ %tb = fptrunc <2 x float> %b to <2 x half>
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign_f64(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f64_param_0];
+; CHECK-DAG: ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808;
+; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808;
+; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48;
+; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48;
+; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]];
+; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]];
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
+ %tb = fptrunc <2 x double> %b to <2 x half>
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
+ ret <2 x half> %r
+}
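+
+; In the f32/f64 variants above, the source sign bit is masked out of the
+; wider bit pattern and shifted down to the f16 sign position: bit 31 of an
+; f32 shifted right by 16 lands on bit 15 (31 - 16 = 15), and bit 63 of an
+; f64 shifted right by 48 also lands on bit 15 (63 - 48 = 15), matching the
+; shr.u32 ..., 16 and shr.u64 ..., 48 lines the checks expect.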
+
+; CHECK-LABEL: test_copysign_extended(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_extended_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_extended_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]];
+; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]];
+; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768;
+; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768;
+; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
+; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
+; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: mov.b32 {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]]
+; CHECK-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[RX0]];
+; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[RX1]];
+; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]};
+; CHECK: ret;
+define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
+ %xr = fpext <2 x half> %r to <2 x float>
+ ret <2 x float> %xr
+}
+
+; CHECK-LABEL: test_floor(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_floor_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_floor(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.floor.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_ceil(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_ceil_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_ceil(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_trunc(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_trunc_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_trunc(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_rint(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_rint_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_rint(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.rint.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_nearbyint(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_nearbyint_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_nearbyint(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_round(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_round(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.round.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
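+
+; The rounding tests above map onto cvt's round-to-integer modifiers:
+; floor -> .rmi (toward -infinity), ceil -> .rpi (toward +infinity),
+; trunc -> .rzi (toward zero), and rint/nearbyint/round -> .rni (to nearest
+; even). Note that cvt.rni only approximates llvm.round, which rounds
+; halfway cases away from zero rather than to even.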
+
+; CHECK-LABEL: test_fmuladd(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmuladd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmuladd_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fmuladd_param_2];
+;
+; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+ %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+ ret <2 x half> %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "unsafe-fp-math" = "true" }
diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll
index 6785a01827e2..351f9b20dc0c 100644
--- a/test/CodeGen/NVPTX/fma.ll
+++ b/test/CodeGen/NVPTX/fma.ll
@@ -1,42 +1,42 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
-
-declare float @dummy_f32(float, float) #0
-declare double @dummy_f64(double, double) #0
-
-define ptx_device float @t1_f32(float %x, float %y, float %z) {
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %a = fmul float %x, %y
- %b = fadd float %a, %z
- ret float %b
-}
-
-define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: ret;
- %a = fmul float %x, %y
- %b = fadd float %a, %z
- %c = fadd float %a, %w
- %d = call float @dummy_f32(float %b, float %c)
- ret float %d
-}
-
-define ptx_device double @t1_f64(double %x, double %y, double %z) {
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %a = fmul double %x, %y
- %b = fadd double %a, %z
- ret double %b
-}
-
-define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: ret;
- %a = fmul double %x, %y
- %b = fadd double %a, %z
- %c = fadd double %a, %w
- %d = call double @dummy_f64(double %b, double %c)
- ret double %d
-}
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | FileCheck %s
+
+declare float @dummy_f32(float, float) #0
+declare double @dummy_f64(double, double) #0
+
+define ptx_device float @t1_f32(float %x, float %y, float %z) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+ %a = fmul float %x, %y
+ %b = fadd float %a, %z
+ ret float %b
+}
+
+define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) {
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: ret;
+ %a = fmul float %x, %y
+ %b = fadd float %a, %z
+ %c = fadd float %a, %w
+ %d = call float @dummy_f32(float %b, float %c)
+ ret float %d
+}
+
+define ptx_device double @t1_f64(double %x, double %y, double %z) {
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: ret;
+ %a = fmul double %x, %y
+ %b = fadd double %a, %z
+ ret double %b
+}
+
+define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) {
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK: ret;
+ %a = fmul double %x, %y
+ %b = fadd double %a, %z
+ %c = fadd double %a, %w
+ %d = call double @dummy_f64(double %b, double %c)
+ ret double %d
+}
diff --git a/test/CodeGen/NVPTX/i8-param.ll b/test/CodeGen/NVPTX/i8-param.ll
index 6a1e3a0e1a0d..c41da0eebd1f 100644
--- a/test/CodeGen/NVPTX/i8-param.ll
+++ b/test/CodeGen/NVPTX/i8-param.ll
@@ -1,23 +1,23 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-
-; CHECK: .visible .func (.param .b32 func_retval0) callee
-define i8 @callee(i8 %a) {
-; CHECK: ld.param.u8
- %ret = add i8 %a, 42
-; CHECK: st.param.b32
- ret i8 %ret
-}
-
-; CHECK: .visible .func caller
-define void @caller(i8* %a) {
-; CHECK: ld.u8
- %val = load i8, i8* %a
- %ret = tail call i8 @callee(i8 %val)
-; CHECK: ld.param.b32
- store i8 %ret, i8* %a
- ret void
-}
-
-
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+; CHECK: .visible .func (.param .b32 func_retval0) callee
+define i8 @callee(i8 %a) {
+; CHECK: ld.param.u8
+ %ret = add i8 %a, 42
+; CHECK: st.param.b32
+ ret i8 %ret
+}
+
+; CHECK: .visible .func caller
+define void @caller(i8* %a) {
+; CHECK: ld.u8
+ %val = load i8, i8* %a
+ %ret = tail call i8 @callee(i8 %val)
+; CHECK: ld.param.b32
+ store i8 %ret, i8* %a
+ ret void
+}
+
+
diff --git a/test/CodeGen/NVPTX/param-load-store.ll b/test/CodeGen/NVPTX/param-load-store.ll
index 8a67567acc96..83991a2930a8 100644
--- a/test/CodeGen/NVPTX/param-load-store.ll
+++ b/test/CodeGen/NVPTX/param-load-store.ll
@@ -1,939 +1,939 @@
-; Verifies correctness of load/store of parameters and return values.
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s
-
-%s_i1 = type { i1 }
-%s_i8 = type { i8 }
-%s_i16 = type { i16 }
-%s_f16 = type { half }
-%s_i32 = type { i32 }
-%s_f32 = type { float }
-%s_i64 = type { i64 }
-%s_f64 = type { double }
-
-; More complicated types. i64 is used to increase natural alignment
-; requirement for the type.
-%s_i32x4 = type { i32, i32, i32, i32, i64}
-%s_i32f32 = type { i32, float, i32, float, i64}
-%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64}
-%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}>
-%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]}
-; All scalar parameters must be at least 32 bits in size.
-; i1 is loaded/stored as i8.
-
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i1(
-; CHECK-NEXT: .param .b32 test_i1_param_0
-; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0];
-; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1;
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[A]]
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni
-; CHECK-NEXT: test_i1,
-; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
-; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i1 @test_i1(i1 %a) {
- %r = tail call i1 @test_i1(i1 %a);
- ret i1 %r;
-}
-
-; Signed i1 is a somewhat special case. We only care about one bit and
-; then use neg.s32 to convert it to 32-bit -1 if it's set.
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i1s(
-; CHECK-NEXT: .param .b32 test_i1s_param_0
-; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0];
-; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
-; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1;
-; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]];
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[A]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni
-; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
-; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1;
-; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define signext i1 @test_i1s(i1 signext %a) {
- %r = tail call signext i1 @test_i1s(i1 signext %a);
- ret i1 %r;
-}
-
-; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment.
-; CHECK: .func (.param .align 4 .b8 func_retval0[4])
-; CHECK-LABEL: test_v3i1(
-; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4]
-; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
-; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0]
-; CHECK: .param .align 4 .b8 param0[4];
-; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
-; CHECK-DAG: st.param.b8 [param0+2], [[E2]];
-; CHECK: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v3i1,
-; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
-; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}
-; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
-; CHECK-NEXT: ret;
-define <3 x i1> @test_v3i1(<3 x i1> %a) {
- %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a);
- ret <3 x i1> %r;
-}
-
-; CHECK: .func (.param .align 4 .b8 func_retval0[4])
-; CHECK-LABEL: test_v4i1(
-; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4]
-; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0]
-; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK: test_v4i1,
-; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]};
-; CHECK-NEXT: ret;
-define <4 x i1> @test_v4i1(<4 x i1> %a) {
- %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a);
- ret <4 x i1> %r;
-}
-
-; CHECK: .func (.param .align 8 .b8 func_retval0[8])
-; CHECK-LABEL: test_v5i1(
-; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8]
-; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
-; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0]
-; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
-; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v5i1,
-; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
-; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
-; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
-; CHECK-NEXT: ret;
-define <5 x i1> @test_v5i1(<5 x i1> %a) {
- %r = tail call <5 x i1> @test_v5i1(<5 x i1> %a);
- ret <5 x i1> %r;
-}
-
-; Unsigned i8 is loaded directly into 32-bit register.
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i8(
-; CHECK-NEXT: .param .b32 test_i8_param_0
-; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0];
-; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
-; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255;
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[A]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK: test_i8,
-; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
-; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i8 @test_i8(i8 %a) {
- %r = tail call i8 @test_i8(i8 %a);
- ret i8 %r;
-}
-
-; signed i8 is loaded into 16-bit register which is then sign-extended to i32.
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i8s(
-; CHECK-NEXT: .param .b32 test_i8s_param_0
-; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
-; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[A]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK: test_i8s,
-; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
-; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ?
-; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]];
-; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define signext i8 @test_i8s(i8 signext %a) {
- %r = tail call signext i8 @test_i8s(i8 signext %a);
- ret i8 %r;
-}
-
-; CHECK: .func (.param .align 4 .b8 func_retval0[4])
-; CHECK-LABEL: test_v3i8(
-; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
-; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2];
-; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0];
-; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.b8 [param0+2], [[E2]];
-; CHECK: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v3i8,
-; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
-; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
-; CHECK-NEXT: ret;
-define <3 x i8> @test_v3i8(<3 x i8> %a) {
- %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a);
- ret <3 x i8> %r;
-}
-
-; CHECK: .func (.param .align 4 .b8 func_retval0[4])
-; CHECK-LABEL: test_v4i8(
-; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
-; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0]
-; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v4i8,
-; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
-; CHECK-NEXT: ret;
-define <4 x i8> @test_v4i8(<4 x i8> %a) {
- %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a);
- ret <4 x i8> %r;
-}
-
-; CHECK: .func (.param .align 8 .b8 func_retval0[8])
-; CHECK-LABEL: test_v5i8(
-; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
-; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
-; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
-; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
-; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v5i8,
-; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
-; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
-; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
-; CHECK-NEXT: ret;
-define <5 x i8> @test_v5i8(<5 x i8> %a) {
- %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a);
- ret <5 x i8> %r;
-}
-
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i16(
-; CHECK-NEXT: .param .b32 test_i16_param_0
-; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0];
-; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[E32]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_i16,
-; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
-; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i16 @test_i16(i16 %a) {
- %r = tail call i16 @test_i16(i16 %a);
- ret i16 %r;
-}
-
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i16s(
-; CHECK-NEXT: .param .b32 test_i16s_param_0
-; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
-; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[E32]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_i16s,
-; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
-; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define signext i16 @test_i16s(i16 signext %a) {
- %r = tail call signext i16 @test_i16s(i16 signext %a);
- ret i16 %r;
-}
-
-; CHECK: .func (.param .align 8 .b8 func_retval0[8])
-; CHECK-LABEL: test_v3i16(
-; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
-; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
-; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
-; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.b16 [param0+4], [[E2]];
-; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v3i16,
-; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
-; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
-; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]];
-; CHECK-NEXT: ret;
-define <3 x i16> @test_v3i16(<3 x i16> %a) {
- %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a);
- ret <3 x i16> %r;
-}
-
-; CHECK: .func (.param .align 8 .b8 func_retval0[8])
-; CHECK-LABEL: test_v4i16(
-; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
-; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
-; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v4i16,
-; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
-; CHECK-NEXT: ret;
-define <4 x i16> @test_v4i16(<4 x i16> %a) {
- %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a);
- ret <4 x i16> %r;
-}
-
-; CHECK: .func (.param .align 16 .b8 func_retval0[16])
-; CHECK-LABEL: test_v5i16(
-; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
-; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
-; CHECK-DAG: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
-; CHECK: .param .align 16 .b8 retval0[16];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v5i16,
-; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8];
-; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
-; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]];
-; CHECK-NEXT: ret;
-define <5 x i16> @test_v5i16(<5 x i16> %a) {
- %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a);
- ret <5 x i16> %r;
-}
-
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_f16(
-; CHECK-NEXT: .param .b32 test_f16_param_0
-; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_f16_param_0];
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b16 [param0+0], [[E]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_f16,
-; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
-; CHECK: st.param.b16 [func_retval0+0], [[R]]
-; CHECK-NEXT: ret;
-define half @test_f16(half %a) {
- %r = tail call half @test_f16(half %a);
- ret half %r;
-}
-
-; CHECK: .func (.param .align 4 .b8 func_retval0[4])
-; CHECK-LABEL: test_v2f16(
-; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
-; CHECK: ld.param.b32 [[E:%hh[0-9]+]], [test_v2f16_param_0];
-; CHECK: .param .align 4 .b8 param0[4];
-; CHECK: st.param.b32 [param0+0], [[E]];
-; CHECK: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v2f16,
-; CHECK: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
-; CHECK: st.param.b32 [func_retval0+0], [[R]]
-; CHECK-NEXT: ret;
-define <2 x half> @test_v2f16(<2 x half> %a) {
- %r = tail call <2 x half> @test_v2f16(<2 x half> %a);
- ret <2 x half> %r;
-}
-
-; CHECK:.func (.param .align 8 .b8 func_retval0[8])
-; CHECK-LABEL: test_v3f16(
-; CHECK: .param .align 8 .b8 test_v3f16_param_0[8]
-; CHECK-DAG: ld.param.b32 [[HH01:%hh[0-9]+]], [test_v3f16_param_0];
-; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
-; CHECK-DAG: ld.param.b16 [[E2:%h[0-9]+]], [test_v3f16_param_0+4];
-; CHECK: .param .align 8 .b8 param0[8];
-; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
-; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
-; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: call.uni (retval0),
-; CHECK: test_v3f16,
-; CHECK-DAG: ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b16 [[R2:%h[0-9]+]], [retval0+4];
-; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]];
-; CHECK: ret;
-define <3 x half> @test_v3f16(<3 x half> %a) {
- %r = tail call <3 x half> @test_v3f16(<3 x half> %a);
- ret <3 x half> %r;
-}
-
-; CHECK:.func (.param .align 8 .b8 func_retval0[8])
-; CHECK-LABEL: test_v4f16(
-; CHECK: .param .align 8 .b8 test_v4f16_param_0[8]
-; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
-; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
-; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
-; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]};
-; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: call.uni (retval0),
-; CHECK: test_v4f16,
-; CHECK: ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]};
-; CHECK: ret;
-define <4 x half> @test_v4f16(<4 x half> %a) {
- %r = tail call <4 x half> @test_v4f16(<4 x half> %a);
- ret <4 x half> %r;
-}
-
-; CHECK:.func (.param .align 16 .b8 func_retval0[16])
-; CHECK-LABEL: test_v5f16(
-; CHECK: .param .align 16 .b8 test_v5f16_param_0[16]
-; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0];
-; CHECK-DAG: ld.param.b16 [[E4:%h[0-9]+]], [test_v5f16_param_0+8];
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK-DAG: st.param.v4.b16 [param0+0],
-; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
-; CHECK: .param .align 16 .b8 retval0[16];
-; CHECK: call.uni (retval0),
-; CHECK: test_v5f16,
-; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b16 [[R4:%h[0-9]+]], [retval0+8];
-; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
-; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]];
-; CHECK: ret;
-define <5 x half> @test_v5f16(<5 x half> %a) {
- %r = tail call <5 x half> @test_v5f16(<5 x half> %a);
- ret <5 x half> %r;
-}
-
-; CHECK:.func (.param .align 16 .b8 func_retval0[16])
-; CHECK-LABEL: test_v8f16(
-; CHECK: .param .align 16 .b8 test_v8f16_param_0[16]
-; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
-; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
-; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
-; CHECK-DAG: mov.b32 [[HH45:%hh[0-9]+]], [[R45]];
-; CHECK-DAG: mov.b32 [[HH67:%hh[0-9]+]], [[R67]];
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK: st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]};
-; CHECK: .param .align 16 .b8 retval0[16];
-; CHECK: call.uni (retval0),
-; CHECK: test_v8f16,
-; CHECK: ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]};
-; CHECK: ret;
-define <8 x half> @test_v8f16(<8 x half> %a) {
- %r = tail call <8 x half> @test_v8f16(<8 x half> %a);
- ret <8 x half> %r;
-}
-
-; CHECK:.func (.param .align 32 .b8 func_retval0[32])
-; CHECK-LABEL: test_v9f16(
-; CHECK: .param .align 32 .b8 test_v9f16_param_0[32]
-; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0];
-; CHECK-DAG: ld.param.v4.b16 {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8];
-; CHECK-DAG: ld.param.b16 [[E8:%h[0-9]+]], [test_v9f16_param_0+16];
-; CHECK: .param .align 32 .b8 param0[32];
-; CHECK-DAG: st.param.v4.b16 [param0+0],
-; CHECK-DAG: st.param.v4.b16 [param0+8],
-; CHECK-DAG: st.param.b16 [param0+16], [[E8]];
-; CHECK: .param .align 32 .b8 retval0[32];
-; CHECK: call.uni (retval0),
-; CHECK: test_v9f16,
-; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8];
-; CHECK-DAG: ld.param.b16 [[R8:%h[0-9]+]], [retval0+16];
-; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
-; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]};
-; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]];
-; CHECK: ret;
-define <9 x half> @test_v9f16(<9 x half> %a) {
- %r = tail call <9 x half> @test_v9f16(<9 x half> %a);
- ret <9 x half> %r;
-}
-
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_i32(
-; CHECK-NEXT: .param .b32 test_i32_param_0
-; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0];
-; CHECK: .param .b32 param0;
-; CHECK: st.param.b32 [param0+0], [[E]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_i32,
-; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i32 @test_i32(i32 %a) {
- %r = tail call i32 @test_i32(i32 %a);
- ret i32 %r;
-}
-
-; CHECK: .func (.param .align 16 .b8 func_retval0[16])
-; CHECK-LABEL: test_v3i32(
-; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
-; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
-; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.b32 [param0+8], [[E2]];
-; CHECK: .param .align 16 .b8 retval0[16];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v3i32,
-; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
-; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
-; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
-; CHECK-NEXT: ret;
-define <3 x i32> @test_v3i32(<3 x i32> %a) {
- %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a);
- ret <3 x i32> %r;
-}
-
-; CHECK: .func (.param .align 16 .b8 func_retval0[16])
-; CHECK-LABEL: test_v4i32(
-; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
-; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: .param .align 16 .b8 retval0[16];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v4i32,
-; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
-; CHECK-NEXT: ret;
-define <4 x i32> @test_v4i32(<4 x i32> %a) {
- %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a);
- ret <4 x i32> %r;
-}
-
-; CHECK: .func (.param .align 32 .b8 func_retval0[32])
-; CHECK-LABEL: test_v5i32(
-; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
-; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
-; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
-; CHECK: .param .align 32 .b8 param0[32];
-; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
-; CHECK: .param .align 32 .b8 retval0[32];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v5i32,
-; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
-; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
-; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
-; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]];
-; CHECK-NEXT: ret;
-define <5 x i32> @test_v5i32(<5 x i32> %a) {
- %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a);
- ret <5 x i32> %r;
-}
-
-; CHECK: .func (.param .b32 func_retval0)
-; CHECK-LABEL: test_f32(
-; CHECK-NEXT: .param .b32 test_f32_param_0
-; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0];
-; CHECK: .param .b32 param0;
-; CHECK: st.param.f32 [param0+0], [[E]];
-; CHECK: .param .b32 retval0;
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_f32,
-; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
-; CHECK: st.param.f32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define float @test_f32(float %a) {
- %r = tail call float @test_f32(float %a);
- ret float %r;
-}
-
-; CHECK: .func (.param .b64 func_retval0)
-; CHECK-LABEL: test_i64(
-; CHECK-NEXT: .param .b64 test_i64_param_0
-; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0];
-; CHECK: .param .b64 param0;
-; CHECK: st.param.b64 [param0+0], [[E]];
-; CHECK: .param .b64 retval0;
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_i64,
-; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
-; CHECK: st.param.b64 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i64 @test_i64(i64 %a) {
- %r = tail call i64 @test_i64(i64 %a);
- ret i64 %r;
-}
-
-; CHECK: .func (.param .align 32 .b8 func_retval0[32])
-; CHECK-LABEL: test_v3i64(
-; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
-; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
-; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
-; CHECK: .param .align 32 .b8 param0[32];
-; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.b64 [param0+16], [[E2]];
-; CHECK: .param .align 32 .b8 retval0[32];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v3i64,
-; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
-; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16];
-; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]];
-; CHECK-NEXT: ret;
-define <3 x i64> @test_v3i64(<3 x i64> %a) {
- %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a);
- ret <3 x i64> %r;
-}
-
-; For i64, vector loads are limited by PTX to 2 elements.
-; CHECK: .func (.param .align 32 .b8 func_retval0[32])
-; CHECK-LABEL: test_v4i64(
-; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32]
-; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
-; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
-; CHECK: .param .align 32 .b8 param0[32];
-; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]};
-; CHECK: .param .align 32 .b8 retval0[32];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_v4i64,
-; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
-; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16];
-; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]};
-; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK-NEXT: ret;
-define <4 x i64> @test_v4i64(<4 x i64> %a) {
- %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a);
- ret <4 x i64> %r;
-}
-
-; Aggregates, on the other hand, do not get extended.
-
-; CHECK: .func (.param .align 1 .b8 func_retval0[1])
-; CHECK-LABEL: test_s_i1(
-; CHECK-NEXT: .param .align 1 .b8 test_s_i1_param_0[1]
-; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
-; CHECK: .param .align 1 .b8 param0[1];
-; CHECK: st.param.b8 [param0+0], [[A]]
-; CHECK: .param .align 1 .b8 retval0[1];
-; CHECK: call.uni
-; CHECK-NEXT: test_s_i1,
-; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
-; CHECK: st.param.b8 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define %s_i1 @test_s_i1(%s_i1 %a) {
- %r = tail call %s_i1 @test_s_i1(%s_i1 %a);
- ret %s_i1 %r;
-}
-
-; CHECK: .func (.param .align 1 .b8 func_retval0[1])
-; CHECK-LABEL: test_s_i8(
-; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
-; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
-; CHECK: .param .align 1 .b8 param0[1];
-; CHECK: st.param.b8 [param0+0], [[A]]
-; CHECK: .param .align 1 .b8 retval0[1];
-; CHECK: call.uni
-; CHECK-NEXT: test_s_i8,
-; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
-; CHECK: st.param.b8 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define %s_i8 @test_s_i8(%s_i8 %a) {
- %r = tail call %s_i8 @test_s_i8(%s_i8 %a);
- ret %s_i8 %r;
-}
-
-; CHECK: .func (.param .align 2 .b8 func_retval0[2])
-; CHECK-LABEL: test_s_i16(
-; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
-; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
-; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0+0], [[A]]
-; CHECK: .param .align 2 .b8 retval0[2];
-; CHECK: call.uni
-; CHECK-NEXT: test_s_i16,
-; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define %s_i16 @test_s_i16(%s_i16 %a) {
- %r = tail call %s_i16 @test_s_i16(%s_i16 %a);
- ret %s_i16 %r;
-}
-
-; CHECK: .func (.param .align 2 .b8 func_retval0[2])
-; CHECK-LABEL: test_s_f16(
-; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2]
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0];
-; CHECK: .param .align 2 .b8 param0[2];
-; CHECK: st.param.b16 [param0+0], [[A]]
-; CHECK: .param .align 2 .b8 retval0[2];
-; CHECK: call.uni
-; CHECK-NEXT: test_s_f16,
-; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define %s_f16 @test_s_f16(%s_f16 %a) {
- %r = tail call %s_f16 @test_s_f16(%s_f16 %a);
- ret %s_f16 %r;
-}
-
-; CHECK: .func (.param .align 4 .b8 func_retval0[4])
-; CHECK-LABEL: test_s_i32(
-; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4]
-; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0];
-; CHECK: .param .align 4 .b8 param0[4]
-; CHECK: st.param.b32 [param0+0], [[E]];
-; CHECK: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_s_i32,
-; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define %s_i32 @test_s_i32(%s_i32 %a) {
- %r = tail call %s_i32 @test_s_i32(%s_i32 %a);
- ret %s_i32 %r;
-}
-
-; CHECK: .func (.param .align 4 .b8 func_retval0[4])
-; CHECK-LABEL: test_s_f32(
-; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
-; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0];
-; CHECK: .param .align 4 .b8 param0[4]
-; CHECK: st.param.f32 [param0+0], [[E]];
-; CHECK: .param .align 4 .b8 retval0[4];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_s_f32,
-; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
-; CHECK: st.param.f32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define %s_f32 @test_s_f32(%s_f32 %a) {
- %r = tail call %s_f32 @test_s_f32(%s_f32 %a);
- ret %s_f32 %r;
-}
-
-; CHECK: .func (.param .align 8 .b8 func_retval0[8])
-; CHECK-LABEL: test_s_i64(
-; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8]
-; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0];
-; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.b64 [param0+0], [[E]];
-; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_s_i64,
-; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
-; CHECK: st.param.b64 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define %s_i64 @test_s_i64(%s_i64 %a) {
- %r = tail call %s_i64 @test_s_i64(%s_i64 %a);
- ret %s_i64 %r;
-}
-
-; Fields that have different types but identical sizes are not vectorized.
-; CHECK: .func (.param .align 8 .b8 func_retval0[24])
-; CHECK-LABEL: test_s_i32f32(
-; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24]
-; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
-; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
-; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
-; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
-; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
-; CHECK: .param .align 8 .b8 param0[24];
-; CHECK-DAG: st.param.b32 [param0+0], [[E0]];
-; CHECK-DAG: st.param.f32 [param0+4], [[E1]];
-; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
-; CHECK-DAG: st.param.f32 [param0+12], [[E3]];
-; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
-; CHECK: .param .align 8 .b8 retval0[24];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_s_i32f32,
-; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0];
-; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4];
-; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
-; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12];
-; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
-; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]];
-; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]];
-; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
-; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]];
-; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
-; CHECK: ret;
-define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
- %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a);
- ret %s_i32f32 %r;
-}
-
-; We do vectorize consecutive fields with matching types.
-; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24])
-; CHECK-LABEL: test_s_i32x4(
-; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24]
-; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32x4_param_0+16];
-; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
-; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
-; CHECK: .param .align 8 .b8 param0[24];
-; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
-; CHECK: st.param.b64 [param0+16], [[E4]];
-; CHECK: .param .align 8 .b8 retval0[24];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_s_i32x4,
-; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
-; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
-; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
-; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]};
-; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
-; CHECK: ret;
-
-define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
- %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a);
- ret %s_i32x4 %r;
-}
-
-; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32])
-; CHECK-LABEL: test_s_i1i32x4(
-; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32]
-; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
-; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
-; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
-; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
-; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
-; CHECK: .param .align 8 .b8 param0[32];
-; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.b8 [param0+8], [[E2]];
-; CHECK: st.param.b32 [param0+12], [[E3]];
-; CHECK: st.param.b32 [param0+16], [[E4]];
-; CHECK: st.param.b64 [param0+24], [[E5]];
-; CHECK: .param .align 8 .b8 retval0[32];
-; CHECK: call.uni (retval0),
-; CHECK: test_s_i1i32x4,
-; CHECK: (
-; CHECK: param0
-; CHECK: );
-; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
-; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8];
-; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12];
-; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
-; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24];
-; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK: st.param.b8 [func_retval0+8], [[RE2]];
-; CHECK: st.param.b32 [func_retval0+12], [[RE3]];
-; CHECK: st.param.b32 [func_retval0+16], [[RE4]];
-; CHECK: st.param.b64 [func_retval0+24], [[RE5]];
-; CHECK: ret;
-
-define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
- %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a);
- ret %s_i8i32x4 %r;
-}
-
-; -- All loads/stores from parameters aligned to one byte must be done one
-; -- byte at a time.
-; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25])
-; CHECK-LABEL: test_s_i1i32x4p(
-; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25]
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1];
-; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0];
-; --- TODO
-; --- Unaligned parameter store / return value load is broken in both nvcc
-; --- and llvm and needs to be fixed.
-; CHECK: .param .align 1 .b8 param0[25];
-; CHECK-DAG: st.param.b32 [param0+0],
-; CHECK-DAG: st.param.b32 [param0+4],
-; CHECK-DAG: st.param.b8 [param0+8],
-; CHECK-DAG: st.param.b32 [param0+9],
-; CHECK-DAG: st.param.b32 [param0+13],
-; CHECK-DAG: st.param.b64 [param0+17],
-; CHECK: .param .align 1 .b8 retval0[25];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_s_i1i32x4p,
-; CHECK-DAG: ld.param.b32 %r41, [retval0+0];
-; CHECK-DAG: ld.param.b32 %r42, [retval0+4];
-; CHECK-DAG: ld.param.b8 %rs2, [retval0+8];
-; CHECK-DAG: ld.param.b32 %r43, [retval0+9];
-; CHECK-DAG: ld.param.b32 %r44, [retval0+13];
-; CHECK-DAG: ld.param.b64 %rd23, [retval0+17];
-; CHECK-DAG: st.param.b32 [func_retval0+0],
-; CHECK-DAG: st.param.b32 [func_retval0+4],
-; CHECK-DAG: st.param.b8 [func_retval0+8],
-; CHECK-DAG: st.param.b32 [func_retval0+9],
-; CHECK-DAG: st.param.b32 [func_retval0+13],
-; CHECK-DAG: st.param.b64 [func_retval0+17],
-
-define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
- %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);
- ret %s_i8i32x4p %r;
-}
-
-; Check that we can vectorize loads that span multiple aggregate fields.
-; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80])
-; CHECK-LABEL: test_s_crossfield(
-; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80]
-; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64];
-; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48];
-; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32];
-; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16];
-; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8];
-; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0];
-; CHECK: .param .align 16 .b8 param0[80];
-; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
-; CHECK: st.param.b32 [param0+8], [[E2]];
-; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]};
-; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]};
-; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]};
-; CHECK: st.param.b32 [param0+64], [[E15]];
-; CHECK: .param .align 16 .b8 retval0[80];
-; CHECK: call.uni (retval0),
-; CHECK: test_s_crossfield,
-; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
-; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
-; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16];
-; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32];
-; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48];
-; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64];
-; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
-; CHECK: st.param.b32 [func_retval0+8], [[RE2]];
-; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]};
-; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]};
-; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]};
-; CHECK: st.param.b32 [func_retval0+64], [[RE15]];
-; CHECK: ret;
-
-define %s_crossfield @test_s_crossfield(%s_crossfield %a) {
- %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a);
- ret %s_crossfield %r;
-}
+; Verifies correctness of load/store of parameters and return values.
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck %s
+
+%s_i1 = type { i1 }
+%s_i8 = type { i8 }
+%s_i16 = type { i16 }
+%s_f16 = type { half }
+%s_i32 = type { i32 }
+%s_f32 = type { float }
+%s_i64 = type { i64 }
+%s_f64 = type { double }
+
+; More complicated types. An i64 field is used to increase the natural
+; alignment requirement of the type.
+%s_i32x4 = type { i32, i32, i32, i32, i64}
+%s_i32f32 = type { i32, float, i32, float, i64}
+%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64}
+%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}>
+%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]}
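+
+; Expected sizes and alignments, as encoded in the .param declarations
+; checked below: %s_i32x4 and %s_i32f32 are 24 bytes (align 8),
+; %s_i8i32x4 is 32 bytes (align 8), the packed %s_i8i32x4p is 25 bytes
+; (align 1), and %s_crossfield is 80 bytes (align 16).
+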
+; All scalar parameters must be at least 32 bits in size.
+; i1 is loaded/stored as i8.
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i1(
+; CHECK-NEXT: .param .b32 test_i1_param_0
+; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0];
+; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1;
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]]
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni
+; CHECK-NEXT: test_i1,
+; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i1 @test_i1(i1 %a) {
+ %r = tail call i1 @test_i1(i1 %a);
+ ret i1 %r;
+}
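+
+; Note: the and.b32 with 1 in the checks above masks off the undefined
+; high bits of the promoted value, so only the low bit of the .b32 param
+; carries information on both the argument and the return path.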
+
+; Signed i1 is a somewhat special case: we only care about one bit, and
+; then use neg.s32 to convert it to a 32-bit -1 if it's set.
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i1s(
+; CHECK-NEXT: .param .b32 test_i1s_param_0
+; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0];
+; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
+; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1;
+; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni
+; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1;
+; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i1 @test_i1s(i1 signext %a) {
+ %r = tail call signext i1 @test_i1s(i1 signext %a);
+ ret i1 %r;
+}
+
+; Make sure that i1 loads are vectorized as i8 loads, respecting each element's alignment.
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v3i1(
+; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4]
+; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
+; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0]
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b8 [param0+2], [[E2]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i1,
+; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
+; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}
+; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i1> @test_v3i1(<3 x i1> %a) {
+ %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a);
+ ret <3 x i1> %r;
+}
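+
+; Note: vector params are padded to the next power-of-two size, so the
+; 3-byte payload above travels in a 4-byte param with 4-byte alignment
+; (and <5 x i1> below uses an 8-byte one).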
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v4i1(
+; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4]
+; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0]
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK: test_v4i1,
+; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]};
+; CHECK-NEXT: ret;
+define <4 x i1> @test_v4i1(<4 x i1> %a) {
+ %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a);
+ ret <4 x i1> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v5i1(
+; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8]
+; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
+; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i1,
+; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i1> @test_v5i1(<5 x i1> %a) {
+ %r = tail call <5 x i1> @test_v5i1(<5 x i1> %a);
+ ret <5 x i1> %r;
+}
+
+; Unsigned i8 is loaded directly into a 32-bit register.
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i8(
+; CHECK-NEXT: .param .b32 test_i8_param_0
+; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0];
+; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
+; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255;
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK: test_i8,
+; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i8 @test_i8(i8 %a) {
+ %r = tail call i8 @test_i8(i8 %a);
+ ret i8 %r;
+}
+
+; Signed i8 is loaded into a 16-bit register, which is then sign-extended to i32.
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i8s(
+; CHECK-NEXT: .param .b32 test_i8s_param_0
+; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
+; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK: test_i8s,
+; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
+; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32?
+; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]];
+; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i8 @test_i8s(i8 signext %a) {
+ %r = tail call signext i8 @test_i8s(i8 signext %a);
+ ret i8 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v3i8(
+; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
+; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2];
+; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0];
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b8 [param0+2], [[E2]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i8,
+; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
+; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i8> @test_v3i8(<3 x i8> %a) {
+ %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a);
+ ret <3 x i8> %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v4i8(
+; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
+; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0]
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i8,
+; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i8> @test_v4i8(<4 x i8> %a) {
+ %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a);
+ ret <4 x i8> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v5i8(
+; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
+; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
+; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i8,
+; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i8> @test_v5i8(<5 x i8> %a) {
+ %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a);
+ ret <5 x i8> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16(
+; CHECK-NEXT: .param .b32 test_i16_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0];
+; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i16 @test_i16(i16 %a) {
+ %r = tail call i16 @test_i16(i16 %a);
+ ret i16 %r;
+}
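+
+; Note the asymmetry above: the incoming i16 argument is zero-extended
+; with cvt.u32.u16, while the returned value comes back in a .b32 and is
+; re-truncated with and.b32 ..., 65535.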
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16s(
+; CHECK-NEXT: .param .b32 test_i16s_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
+; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16s,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i16 @test_i16s(i16 signext %a) {
+ %r = tail call signext i16 @test_i16s(i16 signext %a);
+ ret i16 %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
+; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
+; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b16 [param0+4], [[E2]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i16,
+; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i16> @test_v3i16(<3 x i16> %a) {
+ %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a);
+ ret <3 x i16> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
+; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i16,
+; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i16> @test_v4i16(<4 x i16> %a) {
+ %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a);
+ ret <4 x i16> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5i16(
+; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
+; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
+; CHECK-DAG: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i16,
+; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i16> @test_v5i16(<5 x i16> %a) {
+ %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a);
+ ret <5 x i16> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_f16(
+; CHECK-NEXT: .param .b32 test_f16_param_0
+; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_f16_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b16 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_f16,
+; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define half @test_f16(half %a) {
+ %r = tail call half @test_f16(half %a);
+ ret half %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v2f16(
+; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
+; CHECK: ld.param.b32 [[E:%hh[0-9]+]], [test_v2f16_param_0];
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v2f16,
+; CHECK: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define <2 x half> @test_v2f16(<2 x half> %a) {
+ %r = tail call <2 x half> @test_v2f16(<2 x half> %a);
+ ret <2 x half> %r;
+}
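+
+; Note: a <2 x half> travels as a single packed 32-bit value in an %hh
+; register, so one ld/st.param.b32 moves the whole vector; the larger
+; half vectors below are split into several such packed pairs.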
+
+; CHECK:.func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3f16(
+; CHECK: .param .align 8 .b8 test_v3f16_param_0[8]
+; CHECK-DAG: ld.param.b32 [[HH01:%hh[0-9]+]], [test_v3f16_param_0];
+; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
+; CHECK-DAG: ld.param.b16 [[E2:%h[0-9]+]], [test_v3f16_param_0+4];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK: test_v3f16,
+; CHECK-DAG: ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[R2:%h[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]];
+; CHECK: ret;
+define <3 x half> @test_v3f16(<3 x half> %a) {
+ %r = tail call <3 x half> @test_v3f16(<3 x half> %a);
+ ret <3 x half> %r;
+}
+
+; CHECK:.func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4f16(
+; CHECK: .param .align 8 .b8 test_v4f16_param_0[8]
+; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
+; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
+; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]};
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK: test_v4f16,
+; CHECK: ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]};
+; CHECK: ret;
+define <4 x half> @test_v4f16(<4 x half> %a) {
+ %r = tail call <4 x half> @test_v4f16(<4 x half> %a);
+ ret <4 x half> %r;
+}
+
+; CHECK:.func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5f16(
+; CHECK: .param .align 16 .b8 test_v5f16_param_0[16]
+; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0];
+; CHECK-DAG: ld.param.b16 [[E4:%h[0-9]+]], [test_v5f16_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b16 [param0+0],
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK: test_v5f16,
+; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[R4:%h[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
+; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]];
+; CHECK: ret;
+define <5 x half> @test_v5f16(<5 x half> %a) {
+ %r = tail call <5 x half> @test_v5f16(<5 x half> %a);
+ ret <5 x half> %r;
+}
+
+; CHECK:.func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v8f16(
+; CHECK: .param .align 16 .b8 test_v8f16_param_0[16]
+; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
+; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
+; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
+; CHECK-DAG: mov.b32 [[HH45:%hh[0-9]+]], [[R45]];
+; CHECK-DAG: mov.b32 [[HH67:%hh[0-9]+]], [[R67]];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]};
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK: test_v8f16,
+; CHECK: ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]};
+; CHECK: ret;
+define <8 x half> @test_v8f16(<8 x half> %a) {
+ %r = tail call <8 x half> @test_v8f16(<8 x half> %a);
+ ret <8 x half> %r;
+}
+
+; CHECK:.func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v9f16(
+; CHECK: .param .align 32 .b8 test_v9f16_param_0[32]
+; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0];
+; CHECK-DAG: ld.param.v4.b16 {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8];
+; CHECK-DAG: ld.param.b16 [[E8:%h[0-9]+]], [test_v9f16_param_0+16];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK-DAG: st.param.v4.b16 [param0+0],
+; CHECK-DAG: st.param.v4.b16 [param0+8],
+; CHECK-DAG: st.param.b16 [param0+16], [[E8]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK: test_v9f16,
+; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8];
+; CHECK-DAG: ld.param.b16 [[R8:%h[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
+; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]};
+; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]];
+; CHECK: ret;
+define <9 x half> @test_v9f16(<9 x half> %a) {
+ %r = tail call <9 x half> @test_v9f16(<9 x half> %a);
+ ret <9 x half> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i32(
+; CHECK-NEXT: .param .b32 test_i32_param_0
+; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i32,
+; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i32 @test_i32(i32 %a) {
+ %r = tail call i32 @test_i32(i32 %a);
+ ret i32 %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v3i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b32 [param0+8], [[E2]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i32,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i32> @test_v3i32(<3 x i32> %a) {
+ %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a);
+ ret <3 x i32> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v4i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
+; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i32,
+; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i32> @test_v4i32(<4 x i32> %a) {
+ %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a);
+ ret <4 x i32> %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v5i32(
+; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
+; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i32,
+; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i32> @test_v5i32(<5 x i32> %a) {
+ %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a);
+ ret <5 x i32> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_f32(
+; CHECK-NEXT: .param .b32 test_f32_param_0
+; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.f32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_f32,
+; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_f32(float %a) {
+ %r = tail call float @test_f32(float %a);
+ ret float %r;
+}
+
+; CHECK: .func (.param .b64 func_retval0)
+; CHECK-LABEL: test_i64(
+; CHECK-NEXT: .param .b64 test_i64_param_0
+; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0];
+; CHECK: .param .b64 param0;
+; CHECK: st.param.b64 [param0+0], [[E]];
+; CHECK: .param .b64 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i64,
+; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i64 @test_i64(i64 %a) {
+ %r = tail call i64 @test_i64(i64 %a);
+ ret i64 %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v3i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
+; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b64 [param0+16], [[E2]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i64> @test_v3i64(<3 x i64> %a) {
+ %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a);
+ ret <3 x i64> %r;
+}
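+
+; Note: <3 x i64> occupies 24 bytes, but with power-of-two padding its
+; param is declared as 32 bytes with 32-byte alignment.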
+
+; For i64, vector loads are limited by PTX to 2 elements.
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v4i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32]
+; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]};
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-NEXT: ret;
+define <4 x i64> @test_v4i64(<4 x i64> %a) {
+ %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a);
+ ret <4 x i64> %r;
+}
+
+; Aggregates, on the other hand, do not get extended.
+
+; CHECK: .func (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i1(
+; CHECK-NEXT: .param .align 1 .b8 test_s_i1_param_0[1]
+; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
+; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: st.param.b8 [param0+0], [[A]]
+; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i1,
+; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b8 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i1 @test_s_i1(%s_i1 %a) {
+ %r = tail call %s_i1 @test_s_i1(%s_i1 %a);
+ ret %s_i1 %r;
+}
+
+; CHECK: .func (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i8(
+; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
+; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
+; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: st.param.b8 [param0+0], [[A]]
+; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i8,
+; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b8 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i8 @test_s_i8(%s_i8 %a) {
+ %r = tail call %s_i8 @test_s_i8(%s_i8 %a);
+ ret %s_i8 %r;
+}
+
+; CHECK: .func (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_i16(
+; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
+; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
+; CHECK: .param .align 2 .b8 param0[2];
+; CHECK: st.param.b16 [param0+0], [[A]]
+; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i16,
+; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i16 @test_s_i16(%s_i16 %a) {
+ %r = tail call %s_i16 @test_s_i16(%s_i16 %a);
+ ret %s_i16 %r;
+}
+
+; CHECK: .func (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_f16(
+; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2]
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0];
+; CHECK: .param .align 2 .b8 param0[2];
+; CHECK: st.param.b16 [param0+0], [[A]]
+; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_f16,
+; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_f16 @test_s_f16(%s_f16 %a) {
+ %r = tail call %s_f16 @test_s_f16(%s_f16 %a);
+ ret %s_f16 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_s_i32(
+; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4]
+; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0];
+; CHECK: .param .align 4 .b8 param0[4]
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32,
+; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i32 @test_s_i32(%s_i32 %a) {
+ %r = tail call %s_i32 @test_s_i32(%s_i32 %a);
+ ret %s_i32 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_s_f32(
+; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
+; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0];
+; CHECK: .param .align 4 .b8 param0[4]
+; CHECK: st.param.f32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_f32,
+; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_f32 @test_s_f32(%s_f32 %a) {
+ %r = tail call %s_f32 @test_s_f32(%s_f32 %a);
+ ret %s_f32 %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_s_i64(
+; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8]
+; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.b64 [param0+0], [[E]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i64,
+; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i64 @test_s_i64(%s_i64 %a) {
+ %r = tail call %s_i64 @test_s_i64(%s_i64 %a);
+ ret %s_i64 %r;
+}
+
+; Fields that have different types but identical sizes are not vectorized.
+; CHECK: .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32f32(
+; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
+; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
+; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
+; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.b32 [param0+0], [[E0]];
+; CHECK-DAG: st.param.f32 [param0+4], [[E1]];
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK-DAG: st.param.f32 [param0+12], [[E3]];
+; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32f32,
+; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4];
+; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12];
+; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]];
+; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]];
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]];
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
+ %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a);
+ ret %s_i32f32 %r;
+}
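+
+; Note: as a consequence, every field above moves with a scalar access
+; (five separate ld/st.param ops in each direction), even though i32 and
+; f32 have identical sizes.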
+
+; We do vectorize consecutive fields with matching types.
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32x4(
+; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32x4_param_0+16];
+; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32x4,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
+; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+
+define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
+ %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a);
+ ret %s_i32x4 %r;
+}
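+; A sketch of the vectorized case, assuming
+; %s_i32x4 = type { i32, i32, i32, i32, i64 }: the four consecutive i32
+; fields are paired into two v2.u32/v2.b32 accesses at offsets 0 and 8,
+; while the trailing i64 keeps a scalar 64-bit access at offset 16.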
+
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i1i32x4(
+; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32]
+; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
+; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
+; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
+; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
+; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[32];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b8 [param0+8], [[E2]];
+; CHECK: st.param.b32 [param0+12], [[E3]];
+; CHECK: st.param.b32 [param0+16], [[E4]];
+; CHECK: st.param.b64 [param0+24], [[E5]];
+; CHECK: .param .align 8 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK: test_s_i1i32x4,
+; CHECK: (
+; CHECK: param0
+; CHECK: );
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8];
+; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12];
+; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK: st.param.b8 [func_retval0+8], [[RE2]];
+; CHECK: st.param.b32 [func_retval0+12], [[RE3]];
+; CHECK: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK: st.param.b64 [func_retval0+24], [[RE5]];
+; CHECK: ret;
+
+define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
+ %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a);
+ ret %s_i8i32x4 %r;
+}
+
+; -- All loads/stores from parameters with alignment 1 must be done one
+; -- byte at a time.
+; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25])
+; CHECK-LABEL: test_s_i1i32x4p(
+; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25]
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0];
+; --- TODO
+; --- Unaligned parameter stores and return-value loads are broken in both
+; --- nvcc and llvm and need to be fixed.
+; CHECK: .param .align 1 .b8 param0[25];
+; CHECK-DAG: st.param.b32 [param0+0],
+; CHECK-DAG: st.param.b32 [param0+4],
+; CHECK-DAG: st.param.b8 [param0+8],
+; CHECK-DAG: st.param.b32 [param0+9],
+; CHECK-DAG: st.param.b32 [param0+13],
+; CHECK-DAG: st.param.b64 [param0+17],
+; CHECK: .param .align 1 .b8 retval0[25];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i1i32x4p,
+; CHECK-DAG: ld.param.b32 %r41, [retval0+0];
+; CHECK-DAG: ld.param.b32 %r42, [retval0+4];
+; CHECK-DAG: ld.param.b8 %rs2, [retval0+8];
+; CHECK-DAG: ld.param.b32 %r43, [retval0+9];
+; CHECK-DAG: ld.param.b32 %r44, [retval0+13];
+; CHECK-DAG: ld.param.b64 %rd23, [retval0+17];
+; CHECK-DAG: st.param.b32 [func_retval0+0],
+; CHECK-DAG: st.param.b32 [func_retval0+4],
+; CHECK-DAG: st.param.b8 [func_retval0+8],
+; CHECK-DAG: st.param.b32 [func_retval0+9],
+; CHECK-DAG: st.param.b32 [func_retval0+13],
+; CHECK-DAG: st.param.b64 [func_retval0+17],
+
+define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
+ %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);
+ ret %s_i8i32x4p %r;
+}
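+; The byte loads above follow from the parameter's .align 1: assuming
+; %s_i8i32x4p is a packed struct along the lines of
+; %s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64 }> (25 bytes total,
+; consistent with the store offsets 0/4/8/9/13/17 above), no wider
+; naturally aligned access is legal, so all 25 bytes are loaded
+; individually. The wide, misaligned stores that follow are the breakage
+; flagged in the TODO above.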
+
+; Check that we can vectorize loads that span multiple aggregate fields.
+; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80])
+; CHECK-LABEL: test_s_crossfield(
+; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80]
+; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64];
+; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48];
+; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32];
+; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16];
+; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8];
+; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0];
+; CHECK: .param .align 16 .b8 param0[80];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b32 [param0+8], [[E2]];
+; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]};
+; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]};
+; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]};
+; CHECK: st.param.b32 [param0+64], [[E15]];
+; CHECK: .param .align 16 .b8 retval0[80];
+; CHECK: call.uni (retval0),
+; CHECK: test_s_crossfield,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16];
+; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32];
+; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48];
+; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]};
+; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]};
+; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]};
+; CHECK: st.param.b32 [func_retval0+64], [[RE15]];
+; CHECK: ret;
+
+define %s_crossfield @test_s_crossfield(%s_crossfield %a) {
+ %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a);
+ ret %s_crossfield %r;
+}
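+; With the parameter aligned to 16 bytes, the v4.u32/v4.b32 accesses at
+; offsets 16, 32, and 48 above are free to straddle field boundaries:
+; vectorization here is driven by offset and alignment, not by which
+; aggregate field a byte belongs to.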
diff --git a/test/CodeGen/NVPTX/sched1.ll b/test/CodeGen/NVPTX/sched1.ll
index fb01eb262adc..ecdf55ecdbeb 100644
--- a/test/CodeGen/NVPTX/sched1.ll
+++ b/test/CodeGen/NVPTX/sched1.ll
@@ -6,10 +6,10 @@ define void @foo(i32* %a) {
; CHECK: .func foo
; CHECK: ld.u32
; CHECK-NEXT: ld.u32
-; CHECK-NEXT: ld.u32
-; CHECK-NEXT: ld.u32
; CHECK-NEXT: add.s32
+; CHECK-NEXT: ld.u32
; CHECK-NEXT: add.s32
+; CHECK-NEXT: ld.u32
; CHECK-NEXT: add.s32
%ptr0 = getelementptr i32, i32* %a, i32 0
%val0 = load i32, i32* %ptr0
diff --git a/test/CodeGen/NVPTX/sched2.ll b/test/CodeGen/NVPTX/sched2.ll
index 91ed77878f81..347f77c5682c 100644
--- a/test/CodeGen/NVPTX/sched2.ll
+++ b/test/CodeGen/NVPTX/sched2.ll
@@ -4,12 +4,12 @@ define void @foo(<2 x i32>* %a) {
; CHECK: .func foo
; CHECK: ld.v2.u32
; CHECK-NEXT: ld.v2.u32
-; CHECK-NEXT: ld.v2.u32
-; CHECK-NEXT: ld.v2.u32
; CHECK-NEXT: add.s32
; CHECK-NEXT: add.s32
+; CHECK-NEXT: ld.v2.u32
; CHECK-NEXT: add.s32
; CHECK-NEXT: add.s32
+; CHECK-NEXT: ld.v2.u32
; CHECK-NEXT: add.s32
; CHECK-NEXT: add.s32
%ptr0 = getelementptr <2 x i32>, <2 x i32>* %a, i32 0
diff --git a/test/CodeGen/NVPTX/simple-call.ll b/test/CodeGen/NVPTX/simple-call.ll
index da6568685fe6..8ff0b5da5bcc 100644
--- a/test/CodeGen/NVPTX/simple-call.ll
+++ b/test/CodeGen/NVPTX/simple-call.ll
@@ -1,26 +1,26 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-
-
-
-; CHECK: .func ({{.*}}) device_func
-define float @device_func(float %a) noinline {
- %ret = fmul float %a, %a
- ret float %ret
-}
-
-; CHECK: .entry kernel_func
-define void @kernel_func(float* %a) {
- %val = load float, float* %a
-; CHECK: call.uni (retval0),
-; CHECK: device_func,
- %mul = call float @device_func(float %val)
- store float %mul, float* %a
- ret void
-}
-
-
-
-!nvvm.annotations = !{!1}
-
-!1 = !{void (float*)* @kernel_func, !"kernel", i32 1}
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+
+
+
+; CHECK: .func ({{.*}}) device_func
+define float @device_func(float %a) noinline {
+ %ret = fmul float %a, %a
+ ret float %ret
+}
+
+; CHECK: .entry kernel_func
+define void @kernel_func(float* %a) {
+ %val = load float, float* %a
+; CHECK: call.uni (retval0),
+; CHECK: device_func,
+ %mul = call float @device_func(float %val)
+ store float %mul, float* %a
+ ret void
+}
+
+
+
+!nvvm.annotations = !{!1}
+
+!1 = !{void (float*)* @kernel_func, !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/vec8.ll b/test/CodeGen/NVPTX/vec8.ll
index a86ba1e29d5c..93b39c1125f8 100644
--- a/test/CodeGen/NVPTX/vec8.ll
+++ b/test/CodeGen/NVPTX/vec8.ll
@@ -7,7 +7,7 @@ define void @foo(<8 x i8> %a, i8* %b) {
; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]
; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]
; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1]
-; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]];
+; CHECK-DAG: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]];
; CHECK: st.u8 [%[[B]]], [[T]];
%t0 = extractelement <8 x i8> %a, i32 1
%t1 = extractelement <8 x i8> %a, i32 6
diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll
index bf7b931a5758..d1ec8d25a107 100644
--- a/test/CodeGen/NVPTX/vector-call.ll
+++ b/test/CodeGen/NVPTX/vector-call.ll
@@ -1,30 +1,30 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-
-target triple = "nvptx-unknown-cuda"
-
-declare void @bar(<4 x i32>)
-
-; CHECK-LABEL: .func foo(
-; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
-; CHECK: call.uni
-; CHECK: ret;
-define void @foo(<4 x i32> %a) {
- tail call void @bar(<4 x i32> %a)
- ret void
-}
-
-; CHECK-LABEL: .func foo3(
-; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
-; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
-; CHECK: .param .align 16 .b8 param0[16];
-; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
-; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
-; CHECK: call.uni
-; CHECK: ret;
-declare void @bar3(<3 x i32>)
-define void @foo3(<3 x i32> %a) {
- tail call void @bar3(<3 x i32> %a)
- ret void
-}
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
+
+target triple = "nvptx-unknown-cuda"
+
+declare void @bar(<4 x i32>)
+
+; CHECK-LABEL: .func foo(
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: call.uni
+; CHECK: ret;
+define void @foo(<4 x i32> %a) {
+ tail call void @bar(<4 x i32> %a)
+ ret void
+}
+
+; CHECK-LABEL: .func foo3(
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK: call.uni
+; CHECK: ret;
+declare void @bar3(<3 x i32>)
+define void @foo3(<3 x i32> %a) {
+ tail call void @bar3(<3 x i32> %a)
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/zeroext-32bit.ll b/test/CodeGen/NVPTX/zeroext-32bit.ll
index c2f0ec4b1447..bcfd987b4a66 100644
--- a/test/CodeGen/NVPTX/zeroext-32bit.ll
+++ b/test/CodeGen/NVPTX/zeroext-32bit.ll
@@ -1,26 +1,26 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
-
-; The zeroext attribute below should be silently ignored because
-; we can pass a 32-bit integer across a function call without
-; needing to extend it.
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-target triple = "nvptx64-unknown-cuda"
-
-; CHECK-LABEL: .visible .func zeroext_test
-; CHECK-NOT: cvt.u32.u16
-define void @zeroext_test() {
- tail call void @call1(i32 zeroext 0)
- ret void
-}
-
-declare void @call1(i32 zeroext)
-
-; CHECK-LABEL: .visible .func signext_test
-; CHECK-NOT: cvt.s32.s16
-define void @signext_test() {
- tail call void @call2(i32 zeroext 0)
- ret void
-}
-
-declare void @call2(i32 zeroext)
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s
+
+; The zeroext attribute below should be silently ignored because
+; we can pass a 32-bit integer across a function call without
+; needing to extend it.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-cuda"
+
+; CHECK-LABEL: .visible .func zeroext_test
+; CHECK-NOT: cvt.u32.u16
+define void @zeroext_test() {
+ tail call void @call1(i32 zeroext 0)
+ ret void
+}
+
+declare void @call1(i32 zeroext)
+
+; CHECK-LABEL: .visible .func signext_test
+; CHECK-NOT: cvt.s32.s16
+define void @signext_test() {
+ tail call void @call2(i32 zeroext 0)
+ ret void
+}
+
+declare void @call2(i32 zeroext)
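+; A hedged note: on nvptx64 an i32 argument already fills a 32-bit .param
+; slot, so no widening is needed; the CHECK-NOT lines above assert that no
+; cvt.u32.u16 / cvt.s32.s16 extension instruction is emitted.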
diff --git a/test/CodeGen/PowerPC/mtvsrdd.ll b/test/CodeGen/PowerPC/mtvsrdd.ll
new file mode 100644
index 000000000000..1d6a3553b2a1
--- /dev/null
+++ b/test/CodeGen/PowerPC/mtvsrdd.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mcpu=pwr9 -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-unknown \
+; RUN: < %s | FileCheck %s
+
+; This test case checks that r0 is used as the constant 0 in the mtvsrdd instruction.
+
+define <2 x i64> @const0(i64 %a) {
+ %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
+ %vecinit1 = insertelement <2 x i64> %vecinit, i64 0, i32 1
+ ret <2 x i64> %vecinit1
+; CHECK-LABEL: const0
+; CHECK: mtvsrdd v2, 0, r3
+}
+
+define <2 x i64> @noconst0(i64* %a, i64* %b) {
+ %1 = load i64, i64* %a, align 8
+ %2 = load i64, i64* %b, align 8
+ %vecinit = insertelement <2 x i64> undef, i64 %2, i32 0
+ %vecinit1 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+ ret <2 x i64> %vecinit1
+; CHECK-LABEL: noconst0
+; CHECK: mtvsrdd v2, {{r[0-9]+}}, {{r[0-9]+}}
+}
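+; Background for the checks above: mtvsrdd vT, rA, rB assembles a 128-bit
+; VSX register from two 64-bit GPR values, and the instruction treats
+; rA = 0 as the literal constant 0 rather than the contents of r0. const0
+; can therefore encode its zero element directly in "mtvsrdd v2, 0, r3",
+; while noconst0 must supply both doublewords from real registers.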
diff --git a/test/CodeGen/PowerPC/setcc-logic.ll b/test/CodeGen/PowerPC/setcc-logic.ll
index 2ed08e2ae380..a5a86f101a94 100644
--- a/test/CodeGen/PowerPC/setcc-logic.ll
+++ b/test/CodeGen/PowerPC/setcc-logic.ll
@@ -6,7 +6,7 @@ define zeroext i1 @all_bits_clear(i32 %P, i32 %Q) {
; CHECK: # BB#0:
; CHECK-NEXT: or 3, 3, 4
; CHECK-NEXT: cntlzw 3, 3
-; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31
+; CHECK-NEXT: srwi 3, 3, 5
; CHECK-NEXT: blr
%a = icmp eq i32 %P, 0
%b = icmp eq i32 %Q, 0
@@ -30,11 +30,11 @@ define zeroext i1 @all_sign_bits_clear(i32 %P, i32 %Q) {
define zeroext i1 @all_bits_set(i32 %P, i32 %Q) {
; CHECK-LABEL: all_bits_set:
; CHECK: # BB#0:
+; CHECK-NEXT: li 5, -1
; CHECK-NEXT: and 3, 3, 4
-; CHECK-NEXT: li 5, 0
-; CHECK-NEXT: li 12, 1
-; CHECK-NEXT: cmpwi 0, 3, -1
-; CHECK-NEXT: isel 3, 12, 5, 2
+; CHECK-NEXT: xor 3, 3, 5
+; CHECK-NEXT: cntlzw 3, 3
+; CHECK-NEXT: srwi 3, 3, 5
; CHECK-NEXT: blr
%a = icmp eq i32 %P, -1
%b = icmp eq i32 %Q, -1
@@ -437,7 +437,7 @@ define zeroext i1 @and_eq(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 z
; CHECK-NEXT: xor 3, 3, 4
; CHECK-NEXT: or 3, 3, 5
; CHECK-NEXT: cntlzw 3, 3
-; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31
+; CHECK-NEXT: srwi 3, 3, 5
; CHECK-NEXT: blr
%cmp1 = icmp eq i16 %a, %b
%cmp2 = icmp eq i16 %c, %d
diff --git a/test/CodeGen/PowerPC/stackmap-frame-setup.ll b/test/CodeGen/PowerPC/stackmap-frame-setup.ll
index b5f1d4cfe4bc..b677b8be2966 100644
--- a/test/CodeGen/PowerPC/stackmap-frame-setup.ll
+++ b/test/CodeGen/PowerPC/stackmap-frame-setup.ll
@@ -7,11 +7,11 @@ entry:
store i64 11, i64* %metadata
store i64 12, i64* %metadata
store i64 13, i64* %metadata
-; ISEL: ADJCALLSTACKDOWN 0, implicit-def
+; ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def
; ISEL-NEXT: STACKMAP
; ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def
call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
-; FAST-ISEL: ADJCALLSTACKDOWN 0, implicit-def
+; FAST-ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def
; FAST-ISEL-NEXT: STACKMAP
; FAST-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def
ret void
diff --git a/test/CodeGen/PowerPC/tail-dup-layout.ll b/test/CodeGen/PowerPC/tail-dup-layout.ll
index c9b5bf8c9eeb..9665901e874f 100644
--- a/test/CodeGen/PowerPC/tail-dup-layout.ll
+++ b/test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -1,4 +1,5 @@
-; RUN: llc -O2 < %s | FileCheck %s
+; RUN: llc -O2 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O2 %s
+; RUN: llc -O3 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O3 %s
target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-grtev4-linux-gnu"
@@ -99,11 +100,9 @@ exit:
; test1
; test2
; test3
-; test4
; optional1
; optional2
; optional3
-; optional4
; exit
; even for 50/50 branches.
; Tail duplication puts test n+1 at the end of optional n
@@ -163,6 +162,98 @@ exit:
}
; Intended layout:
+; The chain-of-triangles based duplication produces the following layout
+; when 3 instructions are allowed for tail-duplication.
+; test1
+; test2
+; test3
+; optional1
+; optional2
+; optional3
+; exit
+;
+; Otherwise it produces the layout:
+; test1
+; optional1
+; test2
+; optional2
+; test3
+; optional3
+; exit
+
+;CHECK-LABEL: straight_test_3_instr_test:
+; test1 may have been merged with entry
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: clrlwi {{[0-9]+}}, [[TAGREG]], 30
+;CHECK-NEXT: cmplwi {{[0-9]+}}, 2
+
+;CHECK-O3-NEXT: bne 0, .[[OPT1LABEL:[_0-9A-Za-z]+]]
+;CHECK-O3-NEXT: # %test2
+;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8
+;CHECK-O3-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
+;CHECK-O3-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
+;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32
+;CHECK-O3-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
+;CHECK-O3-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
+;CHECK-O3: blr
+;CHECK-O3-NEXT: .[[OPT1LABEL]]:
+;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8
+;CHECK-O3-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-O3-NEXT: .[[OPT2LABEL]]:
+;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32
+;CHECK-O3-NEXT: beq 0, .[[EXITLABEL]]
+;CHECK-O3-NEXT: .[[OPT3LABEL]]:
+;CHECK-O3: b .[[EXITLABEL]]
+
+;CHECK-O2-NEXT: beq 0, .[[TEST2LABEL:[_0-9A-Za-z]+]]
+;CHECK-O2-NEXT: # %optional1
+;CHECK-O2: .[[TEST2LABEL]]: # %test2
+;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
+;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 8
+;CHECK-O2-NEXT: beq 0, .[[TEST3LABEL:[_0-9A-Za-z]+]]
+;CHECK-O2-NEXT: # %optional2
+;CHECK-O2: .[[TEST3LABEL]]: # %test3
+;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
+;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 32
+;CHECK-O2-NEXT: beq 0, .[[EXITLABEL:[_0-9A-Za-z]+]]
+;CHECK-O2-NEXT: # %optional3
+;CHECK-O2: .[[EXITLABEL]]: # %exit
+;CHECK-O2: blr
+
+
+define void @straight_test_3_instr_test(i32 %tag) {
+entry:
+ br label %test1
+test1:
+ %tagbit1 = and i32 %tag, 3
+ %tagbit1eq0 = icmp eq i32 %tagbit1, 2
+ br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2
+optional1:
+ call void @a()
+ br label %test2
+test2:
+ %tagbit2 = and i32 %tag, 12
+ %tagbit2eq0 = icmp eq i32 %tagbit2, 8
+ br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2
+optional2:
+ call void @b()
+ br label %test3
+test3:
+ %tagbit3 = and i32 %tag, 48
+ %tagbit3eq0 = icmp eq i32 %tagbit3, 32
+ br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1
+optional3:
+ call void @c()
+ br label %exit
+exit:
+ ret void
+}
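+; Note on the two layouts above: each test block reduces to three
+; instructions (a mask such as rlwinm/clrlwi, a cmplwi, and the branch),
+; so it is only duplicable when the tail-duplication limit admits 3
+; instructions. That limit differs between -O2 and -O3, which is what the
+; CHECK-O2 and CHECK-O3 prefixes distinguish.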
+
+; Intended layout:
; The chain-based outlining produces the layout
; entry
; --- Begin loop ---
diff --git a/test/CodeGen/PowerPC/testComparesieqsc.ll b/test/CodeGen/PowerPC/testComparesieqsc.ll
new file mode 100644
index 000000000000..71ad5ed34969
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesieqsc.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; ModuleID = 'ComparisonTestCases/testComparesieqsc.c'
+
+@glob = common local_unnamed_addr global i8 0, align 1
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsc(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: test_ieqsc:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
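+; The sequence above is the branch-free equality idiom used throughout
+; these files: cntlzw of a 32-bit value yields 32 if and only if the value
+; is zero, and srwi by 5 maps 32 to 1 and any smaller count to 0, so
+; zext(a == b) lowers to cntlzw(a ^ b) >> 5 with no compare or isel.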
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsc_sext(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: test_ieqsc_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
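+; For the sign-extended form, rldicr r3, r3, 58, 0 rotates the cntlzw
+; result left by 58 so the bit encoding a count of 32 lands in the sign
+; position (all other bits are masked off), and sradi r3, r3, 63 then
+; broadcasts that bit, producing -1 when the operands are equal and 0
+; otherwise.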
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsc_z(i8 signext %a) {
+; CHECK-LABEL: test_ieqsc_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv1 = zext i1 %cmp to i32
+ ret i32 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsc_sext_z(i8 signext %a) {
+; CHECK-LABEL: test_ieqsc_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsc_store(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: test_ieqsc_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stb r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = zext i1 %cmp to i8
+ store i8 %conv3, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsc_sext_store(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: test_ieqsc_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = sext i1 %cmp to i8
+ store i8 %conv3, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsc_z_store(i8 signext %a) {
+; CHECK-LABEL: test_ieqsc_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = zext i1 %cmp to i8
+ store i8 %conv2, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsc_sext_z_store(i8 signext %a) {
+; CHECK-LABEL: test_ieqsc_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = sext i1 %cmp to i8
+ store i8 %conv2, i8* @glob, align 1
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testComparesieqsi.ll b/test/CodeGen/PowerPC/testComparesieqsi.ll
new file mode 100644
index 000000000000..16882dbd0045
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesieqsi.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; ModuleID = 'ComparisonTestCases/testComparesieqsi.c'
+
+@glob = common local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsi(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: test_ieqsi:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsi_sext(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: test_ieqsi_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsi_z(i32 signext %a) {
+; CHECK-LABEL: test_ieqsi_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqsi_sext_z(i32 signext %a) {
+; CHECK-LABEL: test_ieqsi_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsi_store(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: test_ieqsi_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stw r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsi_sext_store(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: test_ieqsi_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %sub = sext i1 %cmp to i32
+ store i32 %sub, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsi_z_store(i32 signext %a) {
+; CHECK-LABEL: test_ieqsi_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqsi_sext_z_store(i32 signext %a) {
+; CHECK-LABEL: test_ieqsi_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %sub = sext i1 %cmp to i32
+ store i32 %sub, i32* @glob, align 4
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testComparesieqss.ll b/test/CodeGen/PowerPC/testComparesieqss.ll
new file mode 100644
index 000000000000..110c5a62804e
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesieqss.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; ModuleID = 'ComparisonTestCases/testComparesieqss.c'
+
+@glob = common local_unnamed_addr global i16 0, align 2
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqss(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: test_ieqss:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqss_sext(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: test_ieqss_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqss_z(i16 signext %a) {
+; CHECK-LABEL: test_ieqss_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv1 = zext i1 %cmp to i32
+ ret i32 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_ieqss_sext_z(i16 signext %a) {
+; CHECK-LABEL: test_ieqss_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqss_store(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: test_ieqss_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: sth r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = zext i1 %cmp to i16
+ store i16 %conv3, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqss_sext_store(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: test_ieqss_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = sext i1 %cmp to i16
+ store i16 %conv3, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqss_z_store(i16 signext %a) {
+; CHECK-LABEL: test_ieqss_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = zext i1 %cmp to i16
+ store i16 %conv2, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_ieqss_sext_z_store(i16 signext %a) {
+; CHECK-LABEL: test_ieqss_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = sext i1 %cmp to i16
+ store i16 %conv2, i16* @glob, align 2
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testComparesiequc.ll b/test/CodeGen/PowerPC/testComparesiequc.ll
new file mode 100644
index 000000000000..e2c975f2c191
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesiequc.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; ModuleID = 'ComparisonTestCases/testComparesiequc.c'
+
+@glob = common local_unnamed_addr global i8 0, align 1
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequc(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: test_iequc:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequc_sext(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: test_iequc_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequc_z(i8 zeroext %a) {
+; CHECK-LABEL: test_iequc_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv1 = zext i1 %cmp to i32
+ ret i32 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequc_sext_z(i8 zeroext %a) {
+; CHECK-LABEL: test_iequc_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequc_store(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: test_iequc_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stb r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = zext i1 %cmp to i8
+ store i8 %conv3, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequc_sext_store(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: test_iequc_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = sext i1 %cmp to i8
+ store i8 %conv3, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequc_z_store(i8 zeroext %a) {
+; CHECK-LABEL: test_iequc_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = zext i1 %cmp to i8
+ store i8 %conv2, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequc_sext_z_store(i8 zeroext %a) {
+; CHECK-LABEL: test_iequc_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = sext i1 %cmp to i8
+ store i8 %conv2, i8* @glob, align 1
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testComparesiequi.ll b/test/CodeGen/PowerPC/testComparesiequi.ll
new file mode 100644
index 000000000000..789b176a7700
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesiequi.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; ModuleID = 'ComparisonTestCases/testComparesiequi.c'
+
+@glob = common local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequi(i32 zeroext %a, i32 zeroext %b) {
+; CHECK-LABEL: test_iequi:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequi_sext(i32 zeroext %a, i32 zeroext %b) {
+; CHECK-LABEL: test_iequi_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequi_z(i32 zeroext %a) {
+; CHECK-LABEL: test_iequi_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequi_sext_z(i32 zeroext %a) {
+; CHECK-LABEL: test_iequi_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequi_store(i32 zeroext %a, i32 zeroext %b) {
+; CHECK-LABEL: test_iequi_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stw r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequi_sext_store(i32 zeroext %a, i32 zeroext %b) {
+; CHECK-LABEL: test_iequi_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %sub = sext i1 %cmp to i32
+ store i32 %sub, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequi_z_store(i32 zeroext %a) {
+; CHECK-LABEL: test_iequi_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequi_sext_z_store(i32 zeroext %a) {
+; CHECK-LABEL: test_iequi_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %sub = sext i1 %cmp to i32
+ store i32 %sub, i32* @glob, align 4
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testComparesiequs.ll b/test/CodeGen/PowerPC/testComparesiequs.ll
new file mode 100644
index 000000000000..b72943893e98
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesiequs.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; ModuleID = 'ComparisonTestCases/testComparesiequs.c'
+
+@glob = common local_unnamed_addr global i16 0, align 2
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequs(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: test_iequs:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequs_sext(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: test_iequs_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequs_z(i16 zeroext %a) {
+; CHECK-LABEL: test_iequs_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv1 = zext i1 %cmp to i32
+ ret i32 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define signext i32 @test_iequs_sext_z(i16 zeroext %a) {
+; CHECK-LABEL: test_iequs_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequs_store(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: test_iequs_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: sth r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = zext i1 %cmp to i16
+ store i16 %conv3, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequs_sext_store(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: test_iequs_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = sext i1 %cmp to i16
+ store i16 %conv3, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequs_z_store(i16 zeroext %a) {
+; CHECK-LABEL: test_iequs_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = zext i1 %cmp to i16
+ store i16 %conv2, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_iequs_sext_z_store(i16 zeroext %a) {
+; CHECK-LABEL: test_iequs_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = sext i1 %cmp to i16
+ store i16 %conv2, i16* @glob, align 2
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testCompareslleqsc.ll b/test/CodeGen/PowerPC/testCompareslleqsc.ll
new file mode 100644
index 000000000000..56af12827931
--- /dev/null
+++ b/test/CodeGen/PowerPC/testCompareslleqsc.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; ModuleID = 'ComparisonTestCases/testCompareslleqsc.c'
+
+@glob = common local_unnamed_addr global i8 0, align 1
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsc(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: test_lleqsc:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = zext i1 %cmp to i64
+ ret i64 %conv3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsc_sext(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: test_lleqsc_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = sext i1 %cmp to i64
+ ret i64 %conv3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsc_z(i8 signext %a) {
+; CHECK-LABEL: test_lleqsc_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = zext i1 %cmp to i64
+ ret i64 %conv2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsc_sext_z(i8 signext %a) {
+; CHECK-LABEL: test_lleqsc_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = sext i1 %cmp to i64
+ ret i64 %conv2
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsc_store(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: test_lleqsc_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stb r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = zext i1 %cmp to i8
+ store i8 %conv3, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsc_sext_store(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: test_lleqsc_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = sext i1 %cmp to i8
+ store i8 %conv3, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsc_z_store(i8 signext %a) {
+; CHECK-LABEL: test_lleqsc_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = zext i1 %cmp to i8
+ store i8 %conv2, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsc_sext_z_store(i8 signext %a) {
+; CHECK-LABEL: test_lleqsc_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = sext i1 %cmp to i8
+ store i8 %conv2, i8* @glob, align 1
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testCompareslleqsi.ll b/test/CodeGen/PowerPC/testCompareslleqsi.ll
new file mode 100644
index 000000000000..90cf2c85888e
--- /dev/null
+++ b/test/CodeGen/PowerPC/testCompareslleqsi.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+
+@glob = common local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsi(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: test_lleqsi:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsi_sext(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: test_lleqsi_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %conv1 = sext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsi_z(i32 signext %a) {
+; CHECK-LABEL: test_lleqsi_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqsi_sext_z(i32 signext %a) {
+; CHECK-LABEL: test_lleqsi_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %conv1 = sext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsi_store(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: test_lleqsi_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stw r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsi_sext_store(i32 signext %a, i32 signext %b) {
+; CHECK-LABEL: test_lleqsi_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %sub = sext i1 %cmp to i32
+ store i32 %sub, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsi_z_store(i32 signext %a) {
+; CHECK-LABEL: test_lleqsi_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqsi_sext_z_store(i32 signext %a) {
+; CHECK-LABEL: test_lleqsi_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %sub = sext i1 %cmp to i32
+ store i32 %sub, i32* @glob, align 4
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testCompareslleqss.ll b/test/CodeGen/PowerPC/testCompareslleqss.ll
new file mode 100644
index 000000000000..df60a6ccc00e
--- /dev/null
+++ b/test/CodeGen/PowerPC/testCompareslleqss.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+
+@glob = common local_unnamed_addr global i16 0, align 2
+
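+; All functions below expect compare-free code: cntlzw returns 32 only for a
+; zero input, so cntlzw followed by srwi(,5) produces the i1 equality result
+; directly, while rldicr(,58,0) plus sradi(,63) rotates that bit into the MSB
+; and smears it into a sign-extended mask. The --implicit-check-not flags in
+; the RUN lines verify that no cmp* instruction is emitted.
+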
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqss(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: test_lleqss:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = zext i1 %cmp to i64
+ ret i64 %conv3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqss_sext(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: test_lleqss_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = sext i1 %cmp to i64
+ ret i64 %conv3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqss_z(i16 signext %a) {
+; CHECK-LABEL: test_lleqss_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = zext i1 %cmp to i64
+ ret i64 %conv2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_lleqss_sext_z(i16 signext %a) {
+; CHECK-LABEL: test_lleqss_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = sext i1 %cmp to i64
+ ret i64 %conv2
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqss_store(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: test_lleqss_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: sth r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = zext i1 %cmp to i16
+ store i16 %conv3, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqss_sext_store(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: test_lleqss_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = sext i1 %cmp to i16
+ store i16 %conv3, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqss_z_store(i16 signext %a) {
+; CHECK-LABEL: test_lleqss_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = zext i1 %cmp to i16
+ store i16 %conv2, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_lleqss_sext_z_store(i16 signext %a) {
+; CHECK-LABEL: test_lleqss_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = sext i1 %cmp to i16
+ store i16 %conv2, i16* @glob, align 2
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testComparesllequc.ll b/test/CodeGen/PowerPC/testComparesllequc.ll
new file mode 100644
index 000000000000..248825761295
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesllequc.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+
+@glob = common local_unnamed_addr global i8 0, align 1
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequc(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: test_llequc:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = zext i1 %cmp to i64
+ ret i64 %conv3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequc_sext(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: test_llequc_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = sext i1 %cmp to i64
+ ret i64 %conv3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequc_z(i8 zeroext %a) {
+; CHECK-LABEL: test_llequc_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = zext i1 %cmp to i64
+ ret i64 %conv2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequc_sext_z(i8 zeroext %a) {
+; CHECK-LABEL: test_llequc_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = sext i1 %cmp to i64
+ ret i64 %conv2
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequc_store(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: test_llequc_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stb r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = zext i1 %cmp to i8
+ store i8 %conv3, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequc_sext_store(i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: test_llequc_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, %b
+ %conv3 = sext i1 %cmp to i8
+ store i8 %conv3, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequc_z_store(i8 zeroext %a) {
+; CHECK-LABEL: test_llequc_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = zext i1 %cmp to i8
+ store i8 %conv2, i8* @glob, align 1
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequc_sext_z_store(i8 zeroext %a) {
+; CHECK-LABEL: test_llequc_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stb r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i8 %a, 0
+ %conv2 = sext i1 %cmp to i8
+ store i8 %conv2, i8* @glob, align 1
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testComparesllequi.ll b/test/CodeGen/PowerPC/testComparesllequi.ll
new file mode 100644
index 000000000000..2342d80d94ef
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesllequi.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+
+@glob = common local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequi(i32 zeroext %a, i32 zeroext %b) {
+; CHECK-LABEL: test_llequi:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequi_sext(i32 zeroext %a, i32 zeroext %b) {
+; CHECK-LABEL: test_llequi_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %conv1 = sext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequi_z(i32 zeroext %a) {
+; CHECK-LABEL: test_llequi_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequi_sext_z(i32 zeroext %a) {
+; CHECK-LABEL: test_llequi_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %conv1 = sext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequi_store(i32 zeroext %a, i32 zeroext %b) {
+; CHECK-LABEL: test_llequi_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stw r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequi_sext_store(i32 zeroext %a, i32 zeroext %b) {
+; CHECK-LABEL: test_llequi_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, %b
+ %sub = sext i1 %cmp to i32
+ store i32 %sub, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequi_z_store(i32 zeroext %a) {
+; CHECK-LABEL: test_llequi_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @glob, align 4
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequi_sext_z_store(i32 zeroext %a) {
+; CHECK-LABEL: test_llequi_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: stw r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %sub = sext i1 %cmp to i32
+ store i32 %sub, i32* @glob, align 4
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/testComparesllequs.ll b/test/CodeGen/PowerPC/testComparesllequs.ll
new file mode 100644
index 000000000000..e79a974c06f5
--- /dev/null
+++ b/test/CodeGen/PowerPC/testComparesllequs.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \
+; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl
+
+@glob = common local_unnamed_addr global i16 0, align 2
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequs(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: test_llequs:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = zext i1 %cmp to i64
+ ret i64 %conv3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequs_sext(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: test_llequs_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = sext i1 %cmp to i64
+ ret i64 %conv3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequs_z(i16 zeroext %a) {
+; CHECK-LABEL: test_llequs_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = zext i1 %cmp to i64
+ ret i64 %conv2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @test_llequs_sext_z(i16 zeroext %a) {
+; CHECK-LABEL: test_llequs_sext_z:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = sext i1 %cmp to i64
+ ret i64 %conv2
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequs_store(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: test_llequs_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: ld r12, .LC0@toc@l(r5)
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: sth r3, 0(r12)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = zext i1 %cmp to i16
+ store i16 %conv3, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequs_sext_store(i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: test_llequs_sext_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xor r3, r3, r4
+; CHECK-NEXT: addis r5, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r5)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, %b
+ %conv3 = sext i1 %cmp to i16
+ store i16 %conv3, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequs_z_store(i16 zeroext %a) {
+; CHECK-LABEL: test_llequs_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: srwi r3, r3, 5
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = zext i1 %cmp to i16
+ store i16 %conv2, i16* @glob, align 2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @test_llequs_sext_z_store(i16 zeroext %a) {
+; CHECK-LABEL: test_llequs_sext_z_store:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addis r4, r2, .LC0@toc@ha
+; CHECK-NEXT: cntlzw r3, r3
+; CHECK-NEXT: ld r4, .LC0@toc@l(r4)
+; CHECK-NEXT: rldicr r3, r3, 58, 0
+; CHECK-NEXT: sradi r3, r3, 63
+; CHECK-NEXT: sth r3, 0(r4)
+; CHECK-NEXT: blr
+entry:
+ %cmp = icmp eq i16 %a, 0
+ %conv2 = sext i1 %cmp to i16
+ store i16 %conv2, i16* @glob, align 2
+ ret void
+}
diff --git a/test/CodeGen/SPARC/LeonItinerariesUT.ll b/test/CodeGen/SPARC/LeonItinerariesUT.ll
index 87e0c4621c08..d586fe183a92 100644
--- a/test/CodeGen/SPARC/LeonItinerariesUT.ll
+++ b/test/CodeGen/SPARC/LeonItinerariesUT.ll
@@ -28,9 +28,9 @@
; LEON3_4_ITIN-LABEL: f32_ops:
; LEON3_4_ITIN: ld
; LEON3_4_ITIN-NEXT: ld
-; LEON3_4_ITIN-NEXT: ld
; LEON3_4_ITIN-NEXT: fadds
; LEON3_4_ITIN-NEXT: ld
+; LEON3_4_ITIN-NEXT: ld
; LEON3_4_ITIN-NEXT: fsubs
; LEON3_4_ITIN-NEXT: fmuls
; LEON3_4_ITIN-NEXT: retl
@@ -47,4 +47,4 @@ entry:
%6 = fmul float %5, %3
%7 = fdiv float %6, %4
ret float %7
-} \ No newline at end of file
+}
diff --git a/test/CodeGen/SPARC/inlineasm-v9.ll b/test/CodeGen/SPARC/inlineasm-v9.ll
new file mode 100644
index 000000000000..9c5424c46229
--- /dev/null
+++ b/test/CodeGen/SPARC/inlineasm-v9.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=sparcv9 <%s | FileCheck %s
+
+;; Ensures that inline-asm accepts and uses 'f' and 'e' register constraints.
+; CHECK-LABEL: faddd:
+; CHECK: faddd %f0, %f2, %f0
+define double @faddd(double, double) local_unnamed_addr #2 {
+entry:
+ %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e"(double %0, double %1) #7
+ ret double %2
+}
+
+; CHECK-LABEL: faddq:
+; CHECK: faddq %f0, %f4, %f0
+define fp128 @faddq(fp128, fp128) local_unnamed_addr #2 {
+entry:
+ %2 = tail call fp128 asm sideeffect "faddq $1, $2, $0;", "=f,f,e"(fp128 %0, fp128 %1) #7
+ ret fp128 %2
+}
+
+;; Ensure that 'e' can indeed go in the high area, and 'f' cannot.
+; CHECK-LABEL: faddd_high:
+; CHECK: fmovd %f2, %f32
+; CHECK: fmovd %f0, %f2
+; CHECK: faddd %f2, %f32, %f2
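+; (The clobber list below pushes the operands out of %f0-%f31; only the 'e'
+; constraint class reaches the upper register bank on V9, hence the fmovd
+; shuffling in and out of %f32.)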
+define double @faddd_high(double, double) local_unnamed_addr #2 {
+entry:
+ %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e,~{d0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7}"(double %0, double %1) #7
+ ret double %2
+}
+
diff --git a/test/CodeGen/SPARC/inlineasm.ll b/test/CodeGen/SPARC/inlineasm.ll
index af631f0d29f5..35a62706c1ab 100644
--- a/test/CodeGen/SPARC/inlineasm.ll
+++ b/test/CodeGen/SPARC/inlineasm.ll
@@ -94,3 +94,21 @@ entry:
%0 = call i64 asm sideeffect "xor $1, %g0, $0", "=r,0,~{i1}"(i64 5);
ret i64 %0
}
+
+
+;; Ensures that inline-asm accepts and uses 'f' and 'e' register constraints.
+; CHECK-LABEL: fadds:
+; CHECK: fadds %f0, %f1, %f0
+define float @fadds(float, float) local_unnamed_addr #2 {
+entry:
+ %2 = tail call float asm sideeffect "fadds $1, $2, $0;", "=f,f,e"(float %0, float %1) #7
+ ret float %2
+}
+
+; CHECK-LABEL: faddd:
+; CHECK: faddd %f0, %f2, %f0
+define double @faddd(double, double) local_unnamed_addr #2 {
+entry:
+ %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e"(double %0, double %1) #7
+ ret double %2
+}
diff --git a/test/CodeGen/SystemZ/list-ilp-crash.ll b/test/CodeGen/SystemZ/list-ilp-crash.ll
new file mode 100644
index 000000000000..c67ed318b93f
--- /dev/null
+++ b/test/CodeGen/SystemZ/list-ilp-crash.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -pre-RA-sched=list-ilp | FileCheck %s
+;
+; Check that the list-ilp scheduler does not crash due to SystemZ's current use
+; of MVT::Untyped.
+
+define void @pr32723(i8) {
+; CHECK: .text
+BB:
+ br label %CF245
+
+CF245: ; preds = %CF245, %BB
+ %Shuff57 = shufflevector <4 x i8> zeroinitializer, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %Cmp84 = icmp uge i8 %0, undef
+ br i1 %Cmp84, label %CF245, label %CF260
+
+CF260: ; preds = %CF245
+ %B156 = sdiv <4 x i8> %Shuff57, %Shuff57
+ br label %CF255
+
+CF255: ; preds = %CF255, %CF260
+ %I186 = insertelement <4 x i8> %B156, i8 %0, i32 2
+ br label %CF255
+}
diff --git a/test/CodeGen/SystemZ/lower-copy-undef-src.mir b/test/CodeGen/SystemZ/lower-copy-undef-src.mir
new file mode 100644
index 000000000000..322460d79d68
--- /dev/null
+++ b/test/CodeGen/SystemZ/lower-copy-undef-src.mir
@@ -0,0 +1,14 @@
+# RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -run-pass=postrapseudos -o - %s | FileCheck %s
+#
+# Test that a COPY with an undef source operand gets handled like an identity
+# copy rather than lowered into a target instruction with the undef flag
+# dropped.
+---
+# CHECK-LABEL: name: undef_copy
+# CHECK: %r13d = KILL undef %r0d, implicit killed %r12q, implicit-def %r12q
+name: undef_copy
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: %r12q
+ %r13d = COPY undef %r0d, implicit killed %r12q, implicit-def %r12q
diff --git a/test/CodeGen/Thumb2/v8_IT_5.ll b/test/CodeGen/Thumb2/v8_IT_5.ll
index d8d60413cb0e..5e7a40299ed7 100644
--- a/test/CodeGen/Thumb2/v8_IT_5.ll
+++ b/test/CodeGen/Thumb2/v8_IT_5.ll
@@ -9,7 +9,7 @@
; CHECK-NEXT: b
; CHECK: [[JUMPTARGET]]:{{.*}}%if.else173
; CHECK-NEXT: mov.w
-; CHECK-NEXT: bx lr
+; CHECK-NEXT: pop
; CHECK-NEXT: %if.else145
; CHECK-NEXT: mov.w
diff --git a/test/CodeGen/X86/2007-01-08-InstrSched.ll b/test/CodeGen/X86/2007-01-08-InstrSched.ll
index 4ec703921e29..24aa5b98d0bb 100644
--- a/test/CodeGen/X86/2007-01-08-InstrSched.ll
+++ b/test/CodeGen/X86/2007-01-08-InstrSched.ll
@@ -13,10 +13,10 @@ define float @foo(float %x) nounwind {
; CHECK: mulss
; CHECK: mulss
-; CHECK: mulss
-; CHECK: mulss
; CHECK: addss
+; CHECK: mulss
; CHECK: addss
+; CHECK: mulss
; CHECK: addss
; CHECK: ret
}
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index 8b11fd86ef17..ae60d57bbf49 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -1,14 +1,19 @@
-; RUN: llc -march=x86 -O0 < %s | FileCheck %s
-; Currently, dbg.declare generates a DEBUG_VALUE comment. Eventually it will
-; generate DWARF and this test will need to be modified or removed.
+; RUN: llc -march=x86 -O0 < %s -filetype=obj | llvm-dwarfdump - | FileCheck %s
+; CHECK-LABEL: .debug_info contents:
+
+; CHECK-LABEL: DW_TAG_subprogram
+; CHECK: DW_AT_name [DW_FORM_strp] ( {{.*}}"foo")
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 {{..}} )
+; DW_OP_fbreg ??
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"my_r0")
%struct.Pt = type { double, double }
%struct.Rect = type { %struct.Pt, %struct.Pt }
define double @foo(%struct.Rect* byval %my_r0) nounwind ssp !dbg !1 {
entry:
-;CHECK: DEBUG_VALUE
%retval = alloca double ; <double*> [#uses=2]
%0 = alloca double ; <double*> [#uses=2]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
deleted file mode 100644
index 495ff0304b1b..000000000000
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ /dev/null
@@ -1,51 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \
-; RUN: -verify-machineinstrs | FileCheck %s
-;
-; Test LiveInterval update handling of DBG_VALUE.
-; rdar://12777252.
-;
-; CHECK: %entry
-; CHECK: DEBUG_VALUE: subdivp:hg
-; CHECK: j
-
-%struct.node.0.27 = type { i16, double, [3 x double], i32, i32 }
-%struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] }
-%struct.bnode.1.28 = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode.1.28*, %struct.bnode.1.28* }
-
-declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-
-define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp !dbg !14 {
-entry:
- call void @llvm.dbg.declare(metadata %struct.hgstruct.2.29* %hg, metadata !4, metadata !DIExpression()), !dbg !DILocation(scope: !14)
- %type = getelementptr inbounds %struct.node.0.27, %struct.node.0.27* %p, i64 0, i32 0
- %0 = load i16, i16* %type, align 2
- %cmp = icmp eq i16 %0, 1
- br i1 %cmp, label %return, label %for.cond.preheader
-
-for.cond.preheader: ; preds = %entry
- %arrayidx6.1 = getelementptr inbounds %struct.hgstruct.2.29, %struct.hgstruct.2.29* %hg, i64 0, i32 1, i64 1
- %cmp22 = fcmp olt double 0.000000e+00, %dsq
- %conv24 = zext i1 %cmp22 to i16
- br label %return
-
-return: ; preds = %for.cond.preheader, %entry
- %retval.0 = phi i16 [ %conv24, %for.cond.preheader ], [ 0, %entry ]
- ret i16 %retval.0
-}
-
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!12}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !11, enums: !2, retainedTypes: !2, globals: !2)
-!2 = !{}
-!4 = !DILocalVariable(name: "hg", line: 725, arg: 4, scope: !14, file: !5, type: !6)
-!5 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh")
-!6 = !DIDerivedType(tag: DW_TAG_typedef, name: "hgstruct", line: 492, file: !11, baseType: !7)
-!7 = !DICompositeType(tag: DW_TAG_structure_type, line: 487, size: 512, align: 64, file: !11)
-!11 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh")
-!12 = !{i32 1, !"Debug Info Version", i32 3}
-!14 = distinct !DISubprogram(name: "subdivp", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !11, scope: !5, type: !15)
-!15 = !DISubroutineType(types: !16)
-!16 = !{null}
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
deleted file mode 100644
index fbe6000d7ace..000000000000
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ /dev/null
@@ -1,142 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \
-; RUN: -verify-machineinstrs | FileCheck %s
-;
-; Test MachineScheduler handling of DBG_VALUE.
-; rdar://12776937.
-;
-; CHECK: %if.else581
-; CHECK: DEBUG_VALUE: num1
-; CHECK: call
-
-%union.rec = type {}
-
-@.str15 = external hidden unnamed_addr constant [6 x i8], align 1
-
-declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-
-define i32 @AttachGalley(%union.rec** nocapture %suspend_pt) nounwind uwtable ssp !dbg !21 {
-entry:
- %num14075 = alloca [20 x i8], align 16
- br label %if.end33
-
-if.end33: ; preds = %entry
- %cmp1733 = icmp eq i32 undef, 0
- br label %if.else581
-
-if.else581: ; preds = %if.end33
- %cmp586 = icmp eq i8 undef, -123
- br i1 %cmp586, label %if.then588, label %if.else594
-
-if.then588: ; preds = %if.else581
- br label %for.cond1710.preheader
-
-if.else594: ; preds = %if.else581
- unreachable
-
-for.cond1710.preheader: ; preds = %if.then588
- br label %for.cond1710
-
-for.cond1710: ; preds = %for.cond1710, %for.cond1710.preheader
- br i1 undef, label %for.cond1710, label %if.then3344
-
-if.then3344:
- br label %if.then4073
-
-if.then4073: ; preds = %if.then3344
- call void @llvm.dbg.declare(metadata [20 x i8]* %num14075, metadata !4, metadata !DIExpression()), !dbg !DILocation(scope: !5)
- %arraydecay4078 = getelementptr inbounds [20 x i8], [20 x i8]* %num14075, i64 0, i64 0
- %0 = load i32, i32* undef, align 4
- %add4093 = add nsw i32 %0, 0
- %conv4094 = sitofp i32 %add4093 to float
- %div4095 = fdiv float %conv4094, 5.670000e+02
- %conv4096 = fpext float %div4095 to double
- %call4097 = call i32 (i8*, i32, i64, i8*, ...) @__sprintf_chk(i8* %arraydecay4078, i32 0, i64 20, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str15, i64 0, i64 0), double %conv4096) nounwind
- br i1 %cmp1733, label %if.then4107, label %if.else4114
-
-if.then4107: ; preds = %if.then4073
- unreachable
-
-if.else4114: ; preds = %if.then4073
- unreachable
-}
-
-declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!35}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !19, enums: !2, retainedTypes: !2, globals: !2)
-!1 = !{!2}
-!2 = !{}
-!4 = !DILocalVariable(name: "num1", line: 815, scope: !5, file: !14, type: !15)
-!5 = distinct !DILexicalBlock(line: 815, column: 0, file: !14, scope: !6)
-!6 = distinct !DILexicalBlock(line: 812, column: 0, file: !14, scope: !7)
-!7 = distinct !DILexicalBlock(line: 807, column: 0, file: !14, scope: !8)
-!8 = distinct !DILexicalBlock(line: 440, column: 0, file: !14, scope: !9)
-!9 = distinct !DILexicalBlock(line: 435, column: 0, file: !14, scope: !10)
-!10 = distinct !DILexicalBlock(line: 434, column: 0, file: !14, scope: !11)
-!11 = distinct !DILexicalBlock(line: 250, column: 0, file: !14, scope: !12)
-!12 = distinct !DILexicalBlock(line: 249, column: 0, file: !14, scope: !13)
-!13 = distinct !DILexicalBlock(line: 221, column: 0, file: !14, scope: !21)
-!14 = !DIFile(filename: "MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", directory: "MultiSource/Benchmarks/MiBench/consumer-typeset")
-!15 = !DICompositeType(tag: DW_TAG_array_type, size: 160, align: 8, baseType: !16, elements: !17)
-!16 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
-!17 = !{!18}
-!18 = !DISubrange(count: 20)
-!19 = !DIFile(filename: "MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", directory: "MultiSource/Benchmarks/MiBench/consumer-typeset")
-
-!21 = distinct !DISubprogram(name: "AttachGalley", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !19, scope: !14, type: !22)
-!22 = !DISubroutineType(types: !23)
-!23 = !{null}
-
-; Test DebugValue uses visited by RegisterPressureTracker findUseBetween().
-;
-; CHECK: @main
-; CHECK: DEBUG_VALUE: main:X
-; CHECK: call
-
-%"class.__gnu_cxx::hash_map" = type { %"class.__gnu_cxx::hashtable" }
-%"class.__gnu_cxx::hashtable" = type { i64, i64, i64, i64, i64, i64 }
-
-define void @main() uwtable ssp personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg !37 {
-entry:
- %X = alloca %"class.__gnu_cxx::hash_map", align 8
- br i1 undef, label %cond.true, label %cond.end
-
-cond.true: ; preds = %entry
- unreachable
-
-cond.end: ; preds = %entry
- call void @llvm.dbg.declare(metadata %"class.__gnu_cxx::hash_map"* %X, metadata !31, metadata !DIExpression()), !dbg !DILocation(scope: !37)
- %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map", %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5
- invoke void @_Znwm()
- to label %exit.i unwind label %lpad2.i.i.i.i
-
-exit.i: ; preds = %cond.end
- unreachable
-
-lpad2.i.i.i.i: ; preds = %cond.end
- %0 = landingpad { i8*, i32 }
- cleanup
- br i1 undef, label %lpad.body.i.i, label %if.then.i.i.i.i.i.i.i.i
-
-if.then.i.i.i.i.i.i.i.i: ; preds = %lpad2.i.i.i.i
- unreachable
-
-lpad.body.i.i: ; preds = %lpad2.i.i.i.i
- resume { i8*, i32 } %0
-}
-
-declare i32 @__gxx_personality_v0(...)
-
-declare void @_Znwm()
-
-!llvm.dbg.cu = !{!30}
-
-!30 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 169129) (llvm/trunk 169135)", isOptimized: true, emissionKind: FullDebug, file: !34, enums: !2, retainedTypes: !2)
-!31 = !DILocalVariable(name: "X", line: 29, scope: !37, type: !32)
-!32 = !DIDerivedType(tag: DW_TAG_typedef, name: "HM", line: 28, file: !34, baseType: null)
-!33 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++")
-!34 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++")
-!35 = !{i32 1, !"Debug Info Version", i32 3}
-!37 = distinct !DISubprogram(name: "main", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !30, scopeLine: 1, file: !19, scope: !14, type: !22)
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
deleted file mode 100644
index a717202d3574..000000000000
--- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \
-; RUN: -verify-machineinstrs | FileCheck %s
-;
-; Test RegisterPressure handling of DBG_VALUE.
-;
-; CHECK: %entry
-; CHECK: DEBUG_VALUE: test:callback
-; CHECK: ret
-
-%struct.btCompoundLeafCallback = type { i32, i32 }
-
-declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-
-define void @test() unnamed_addr uwtable ssp align 2 !dbg !2 {
-entry:
- %callback = alloca %struct.btCompoundLeafCallback, align 8
- br i1 undef, label %if.end, label %if.then
-
-if.then: ; preds = %entry
- unreachable
-
-if.end: ; preds = %entry
- call void @llvm.dbg.declare(metadata %struct.btCompoundLeafCallback* %callback, metadata !3, metadata !DIExpression()), !dbg !DILocation(scope: !2)
- %m = getelementptr inbounds %struct.btCompoundLeafCallback, %struct.btCompoundLeafCallback* %callback, i64 0, i32 1
- store i32 0, i32* undef, align 8
- %cmp12447 = icmp sgt i32 undef, 0
- br i1 %cmp12447, label %for.body.lr.ph, label %invoke.cont44
-
-for.body.lr.ph: ; preds = %if.end
- unreachable
-
-invoke.cont44: ; preds = %if.end
- ret void
-}
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!8}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 168984) (llvm/trunk 168983)", isOptimized: true, emissionKind: FullDebug, file: !6)
-!2 = distinct !DISubprogram(name: "test", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !6, scope: !5, type: !7)
-!3 = !DILocalVariable(name: "callback", line: 214, scope: !2, type: !4)
-!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "btCompoundLeafCallback", line: 90, size: 64, align: 64, file: !6)
-!5 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet")
-!6 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet")
-!7 = !DISubroutineType(types: !9)
-!8 = !{i32 1, !"Debug Info Version", i32 3}
-!9 = !{null}
diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll
new file mode 100644
index 000000000000..553bc2789ff0
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
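+
+; The "# kill:" annotations below mark copies that implicitly (re)define the
+; containing 64-bit register: lea address operands must be full 64-bit
+; registers, so the narrower arguments are referenced as %rdi/%rsi.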
+
+define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
+; ALL-LABEL: test_add_i64:
+; ALL: # BB#0:
+; ALL-NEXT: leaq (%rsi,%rdi), %rax
+; ALL-NEXT: retq
+ %ret = add i64 %arg1, %arg2
+ ret i64 %ret
+}
+
+define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
+; ALL-LABEL: test_add_i32:
+; ALL: # BB#0:
+; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; ALL-NEXT: leal (%rsi,%rdi), %eax
+; ALL-NEXT: retq
+ %ret = add i32 %arg1, %arg2
+ ret i32 %ret
+}
+
+define i16 @test_add_i16(i16 %arg1, i16 %arg2) {
+; ALL-LABEL: test_add_i16:
+; ALL: # BB#0:
+; ALL-NEXT: # kill: %DI<def> %DI<kill> %RDI<def>
+; ALL-NEXT: # kill: %SI<def> %SI<kill> %RSI<def>
+; ALL-NEXT: leal (%rsi,%rdi), %eax
+; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ALL-NEXT: retq
+ %ret = add i16 %arg1, %arg2
+ ret i16 %ret
+}
+
+define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
+; ALL-LABEL: test_add_i8:
+; ALL: # BB#0:
+; ALL-NEXT: addb %dil, %sil
+; ALL-NEXT: movl %esi, %eax
+; ALL-NEXT: retq
+ %ret = add i8 %arg1, %arg2
+ ret i8 %ret
+}
diff --git a/test/CodeGen/X86/GlobalISel/binop.ll b/test/CodeGen/X86/GlobalISel/binop.ll
index bf4c42cb4292..1aae1db8ab07 100644
--- a/test/CodeGen/X86/GlobalISel/binop.ll
+++ b/test/CodeGen/X86/GlobalISel/binop.ll
@@ -4,48 +4,6 @@
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512F
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512VL
-define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
-; ALL-LABEL: test_add_i64:
-; ALL: # BB#0:
-; ALL-NEXT: leaq (%rsi,%rdi), %rax
-; ALL-NEXT: retq
- %ret = add i64 %arg1, %arg2
- ret i64 %ret
-}
-
-define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
-; ALL-LABEL: test_add_i32:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; ALL-NEXT: leal (%rsi,%rdi), %eax
-; ALL-NEXT: retq
- %ret = add i32 %arg1, %arg2
- ret i32 %ret
-}
-
-define i16 @test_add_i16(i16 %arg1, i16 %arg2) {
-; ALL-LABEL: test_add_i16:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %DI<def> %DI<kill> %RDI<def>
-; ALL-NEXT: # kill: %SI<def> %SI<kill> %RSI<def>
-; ALL-NEXT: leal (%rsi,%rdi), %eax
-; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; ALL-NEXT: retq
- %ret = add i16 %arg1, %arg2
- ret i16 %ret
-}
-
-define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
-; ALL-LABEL: test_add_i8:
-; ALL: # BB#0:
-; ALL-NEXT: addb %dil, %sil
-; ALL-NEXT: movl %esi, %eax
-; ALL-NEXT: retq
- %ret = add i8 %arg1, %arg2
- ret i8 %ret
-}
-
define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
; ALL-LABEL: test_sub_i64:
; ALL: # BB#0:
diff --git a/test/CodeGen/X86/GlobalISel/br.ll b/test/CodeGen/X86/GlobalISel/br.ll
new file mode 100644
index 000000000000..faa6a0350337
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/br.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+
+define void @uncondbr() {
+; CHECK-LABEL: uncondbr:
+; CHECK: # BB#1: # %entry
+; CHECK-NEXT: jmp .LBB0_3
+; CHECK-NEXT: .LBB0_2: # %end
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_3: # %bb2
+; CHECK-NEXT: jmp .LBB0_2
+entry:
+ br label %bb2
+end:
+ ret void
+bb2:
+ br label %end
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/cmp.ll b/test/CodeGen/X86/GlobalISel/cmp.ll
new file mode 100644
index 000000000000..03692bb6b1de
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/cmp.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL
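+
+; GlobalISel materializes the i1-to-i32 zero extension of each setcc result
+; explicitly, so every comparison below is expected to end with an
+; "andl $1" mask.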
+
+define i32 @test_icmp_eq_i8(i8 %a, i8 %b) {
+; ALL-LABEL: test_icmp_eq_i8:
+; ALL: # BB#0:
+; ALL-NEXT: cmpb %sil, %dil
+; ALL-NEXT: sete %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp eq i8 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_eq_i16(i16 %a, i16 %b) {
+; ALL-LABEL: test_icmp_eq_i16:
+; ALL: # BB#0:
+; ALL-NEXT: cmpw %si, %di
+; ALL-NEXT: sete %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp eq i16 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_eq_i64(i64 %a, i64 %b) {
+; ALL-LABEL: test_icmp_eq_i64:
+; ALL: # BB#0:
+; ALL-NEXT: cmpq %rsi, %rdi
+; ALL-NEXT: sete %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp eq i64 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_eq_i32(i32 %a, i32 %b) {
+; ALL-LABEL: test_icmp_eq_i32:
+; ALL: # BB#0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: sete %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp eq i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_ne_i32(i32 %a, i32 %b) {
+; ALL-LABEL: test_icmp_ne_i32:
+; ALL: # BB#0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: setne %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp ne i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_ugt_i32(i32 %a, i32 %b) {
+; ALL-LABEL: test_icmp_ugt_i32:
+; ALL: # BB#0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: seta %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp ugt i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_uge_i32(i32 %a, i32 %b) {
+; ALL-LABEL: test_icmp_uge_i32:
+; ALL: # BB#0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: setae %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp uge i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_ult_i32(i32 %a, i32 %b) {
+; ALL-LABEL: test_icmp_ult_i32:
+; ALL: # BB#0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: setb %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp ult i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_ule_i32(i32 %a, i32 %b) {
+; ALL-LABEL: test_icmp_ule_i32:
+; ALL: # BB#0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: setbe %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp ule i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_sgt_i32(i32 %a, i32 %b) {
+; ALL-LABEL: test_icmp_sgt_i32:
+; ALL: # BB#0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: setg %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp sgt i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_sge_i32(i32 %a, i32 %b) {
+; ALL-LABEL: test_icmp_sge_i32:
+; ALL: # BB#0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: setge %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp sge i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_slt_i32(i32 %a, i32 %b) {
+; ALL-LABEL: test_icmp_slt_i32:
+; ALL: # BB#0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: setl %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp slt i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
+define i32 @test_icmp_sle_i32(i32 %a, i32 %b) {
+; ALL-LABEL: test_icmp_sle_i32:
+; ALL: # BB#0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: setle %al
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %r = icmp sle i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
index c4d3566008b1..64cd0e70a4fd 100644
--- a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
+++ b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
@@ -1,7 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X64
-; TODO merge with ext.ll after i64 sext suported on 32bit platform
+; TODO: merge with ext.ll once i64 sext is supported on 32-bit platforms
+
+define i64 @test_zext_i1(i8 %a) {
+; X64-LABEL: test_zext_i1:
+; X64: # BB#0:
+; X64-NEXT: # kill: %DIL<def> %DIL<kill> %RDI<def>
+; X64-NEXT: andq $1, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %val = trunc i8 %a to i1
+ %r = zext i1 %val to i64
+ ret i64 %r
+}
define i64 @test_sext_i8(i8 %val) {
; X64-LABEL: test_sext_i8:
diff --git a/test/CodeGen/X86/GlobalISel/ext.ll b/test/CodeGen/X86/GlobalISel/ext.ll
index 3c032686130e..4d4e3b05ca28 100644
--- a/test/CodeGen/X86/GlobalISel/ext.ll
+++ b/test/CodeGen/X86/GlobalISel/ext.ll
@@ -2,6 +2,24 @@
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X64
; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X32
+define i32 @test_zext_i1(i32 %a) {
+; X64-LABEL: test_zext_i1:
+; X64: # BB#0:
+; X64-NEXT: andl $1, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_zext_i1:
+; X32: # BB#0:
+; X32-NEXT: leal 4(%esp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+ %val = trunc i32 %a to i1
+ %r = zext i1 %val to i32
+ ret i32 %r
+}
+
define i32 @test_zext_i8(i8 %val) {
; X64-LABEL: test_zext_i8:
; X64: # BB#0:
diff --git a/test/CodeGen/X86/GlobalISel/legalize-cmp.mir b/test/CodeGen/X86/GlobalISel/legalize-cmp.mir
new file mode 100644
index 000000000000..68ccbbba0a73
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-cmp.mir
@@ -0,0 +1,179 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
+
+--- |
+ define i32 @test_cmp_i8(i8 %a, i8 %b) {
+ %r = icmp ult i8 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_cmp_i16(i16 %a, i16 %b) {
+ %r = icmp ult i16 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_cmp_i32(i32 %a, i32 %b) {
+ %r = icmp ult i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_cmp_i64(i64 %a, i64 %b) {
+ %r = icmp ult i64 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_cmp_p0(i32* %a, i32* %b) {
+ %r = icmp ult i32* %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+...
+---
+name: test_cmp_i8
+# CHECK-LABEL: name: test_cmp_i8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+# CHECK: %0(s8) = COPY %edi
+# CHECK-NEXT: %1(s8) = COPY %esi
+# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s8), %1
+# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1)
+# CHECK-NEXT: %eax = COPY %3(s32)
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s8) = COPY %edi
+ %1(s8) = COPY %esi
+ %2(s1) = G_ICMP intpred(ult), %0(s8), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_cmp_i16
+# CHECK-LABEL: name: test_cmp_i16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+# CHECK: %0(s16) = COPY %edi
+# CHECK-NEXT: %1(s16) = COPY %esi
+# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s16), %1
+# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1)
+# CHECK-NEXT: %eax = COPY %3(s32)
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s16) = COPY %edi
+ %1(s16) = COPY %esi
+ %2(s1) = G_ICMP intpred(ult), %0(s16), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_cmp_i32
+# CHECK-LABEL: name: test_cmp_i32
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+# CHECK: %0(s32) = COPY %edi
+# CHECK-NEXT: %1(s32) = COPY %esi
+# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s32), %1
+# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1)
+# CHECK-NEXT: %eax = COPY %3(s32)
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(ult), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_cmp_i64
+# CHECK-LABEL: name: test_cmp_i64
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+# CHECK: %0(s64) = COPY %rdi
+# CHECK-NEXT: %1(s64) = COPY %rsi
+# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s64), %1
+# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1)
+# CHECK-NEXT: %eax = COPY %3(s32)
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi, %rsi
+
+ %0(s64) = COPY %rdi
+ %1(s64) = COPY %rsi
+ %2(s1) = G_ICMP intpred(ult), %0(s64), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_cmp_p0
+# CHECK-LABEL: name: test_cmp_p0
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+# CHECK: %0(p0) = COPY %rdi
+# CHECK-NEXT: %1(p0) = COPY %rsi
+# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(p0), %1
+# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1)
+# CHECK-NEXT: %eax = COPY %3(s32)
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi, %rsi
+
+ %0(p0) = COPY %rdi
+ %1(p0) = COPY %rsi
+ %2(s1) = G_ICMP intpred(ult), %0(p0), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir
index 25af600f2299..6f051f1b6ea5 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir
@@ -1,6 +1,12 @@
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
+ define i64 @test_sext_i1(i8 %a) {
+ %val = trunc i8 %a to i1
+ %r = sext i1 %val to i64
+ ret i64 %r
+ }
+
define i64 @test_sext_i8(i8 %val) {
%r = sext i8 %val to i64
ret i64 %r
@@ -16,6 +22,12 @@
ret i64 %r
}
+ define i64 @test_zext_i1(i8 %a) {
+ %val = trunc i8 %a to i1
+ %r = zext i1 %val to i64
+ ret i64 %r
+ }
+
define i64 @test_zext_i8(i8 %val) {
%r = zext i8 %val to i64
ret i64 %r
@@ -33,6 +45,32 @@
...
---
+name: test_sext_i1
+# CHECK-LABEL: name: test_sext_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# CHECK: %0(s8) = COPY %edi
+# CHECK-NEXT: %1(s1) = G_TRUNC %0(s8)
+# CHECK-NEXT: %2(s64) = G_SEXT %1(s1)
+# CHECK-NEXT: %rax = COPY %2(s64)
+# CHECK-NEXT: RET 0, implicit %rax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s8) = COPY %edi
+ %1(s1) = G_TRUNC %0(s8)
+ %2(s64) = G_SEXT %1(s1)
+ %rax = COPY %2(s64)
+ RET 0, implicit %rax
+
+...
+---
name: test_sext_i8
# CHECK-LABEL: name: test_sext_i8
alignment: 4
@@ -102,6 +140,32 @@ body: |
...
---
+name: test_zext_i1
+# CHECK-LABEL: name: test_zext_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# CHECK: %0(s8) = COPY %edi
+# CHECK-NEXT: %1(s1) = G_TRUNC %0(s8)
+# CHECK-NEXT: %2(s64) = G_ZEXT %1(s1)
+# CHECK-NEXT: %rax = COPY %2(s64)
+# CHECK-NEXT: RET 0, implicit %rax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s8) = COPY %edi
+ %1(s1) = G_TRUNC %0(s8)
+ %2(s64) = G_ZEXT %1(s1)
+ %rax = COPY %2(s64)
+ RET 0, implicit %rax
+
+...
+---
name: test_zext_i8
# CHECK-LABEL: name: test_zext_i8
alignment: 4
diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext.mir b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
index 46457e0fff59..c9add0dc4e95 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
@@ -1,6 +1,12 @@
# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
+ define i32 @test_zext_i1(i8 %a) {
+ %val = trunc i8 %a to i1
+ %r = zext i1 %val to i32
+ ret i32 %r
+ }
+
define i32 @test_zext_i8(i8 %val) {
%r = zext i8 %val to i32
ret i32 %r
@@ -11,6 +17,12 @@
ret i32 %r
}
+ define i32 @test_sext_i1(i8 %a) {
+ %val = trunc i8 %a to i1
+ %r = sext i1 %val to i32
+ ret i32 %r
+ }
+
define i32 @test_sext_i8(i8 %val) {
%r = sext i8 %val to i32
ret i32 %r
@@ -23,6 +35,32 @@
...
---
+name: test_zext_i1
+# ALL-LABEL: name: test_zext_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(s8) = COPY %edi
+# ALL-NEXT: %1(s1) = G_TRUNC %0(s8)
+# ALL-NEXT: %2(s32) = G_ZEXT %1(s1)
+# ALL-NEXT: %eax = COPY %2(s32)
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s8) = COPY %edi
+ %1(s1) = G_TRUNC %0(s8)
+ %2(s32) = G_ZEXT %1(s1)
+ %eax = COPY %2(s32)
+ RET 0, implicit %eax
+
+...
+---
name: test_zext_i8
# ALL-LABEL: name: test_zext_i8
alignment: 4
@@ -69,6 +107,32 @@ body: |
...
---
+name: test_sext_i1
+# ALL-LABEL: name: test_sext_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# ALL: %0(s8) = COPY %edi
+# ALL-NEXT: %1(s1) = G_TRUNC %0(s8)
+# ALL-NEXT: %2(s32) = G_SEXT %1(s1)
+# ALL-NEXT: %eax = COPY %2(s32)
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s8) = COPY %edi
+ %1(s1) = G_TRUNC %0(s8)
+ %2(s32) = G_SEXT %1(s1)
+ %eax = COPY %2(s32)
+ RET 0, implicit %eax
+
+...
+---
name: test_sext_i8
# ALL-LABEL: name: test_sext_i8
alignment: 4
diff --git a/test/CodeGen/X86/GlobalISel/memop-x32.ll b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
index 49a7fd79f8b2..49a7fd79f8b2 100644
--- a/test/CodeGen/X86/GlobalISel/memop-x32.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
diff --git a/test/CodeGen/X86/GlobalISel/memop.ll b/test/CodeGen/X86/GlobalISel/memop-scalar.ll
index a7407c0e6b75..3e45a9c9a49d 100644
--- a/test/CodeGen/X86/GlobalISel/memop.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-scalar.ll
@@ -1,13 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST
-; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY
-; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX_FAST
-; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX_GREEDY
-; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX512F_FAST
-; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512F_GREEDY
-; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX512VL_FAST
-; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512VL_GREEDY
-
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_FAST
+; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_GREEDY
define i8 @test_load_i8(i8 * %p1) {
; ALL-LABEL: test_load_i8:
@@ -77,34 +70,6 @@ define double @test_load_double(double * %p1) {
ret double %r
}
-define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) {
-; SSE-LABEL: test_load_v4i32_noalign:
-; SSE: # BB#0:
-; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: retq
-;
-; ALL_AVX-LABEL: test_load_v4i32_noalign:
-; ALL_AVX: # BB#0:
-; ALL_AVX-NEXT: vmovups (%rdi), %xmm0
-; ALL_AVX-NEXT: retq
- %r = load <4 x i32>, <4 x i32>* %p1, align 1
- ret <4 x i32> %r
-}
-
-define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) {
-; SSE-LABEL: test_load_v4i32_align:
-; SSE: # BB#0:
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: retq
-;
-; ALL_AVX-LABEL: test_load_v4i32_align:
-; ALL_AVX: # BB#0:
-; ALL_AVX-NEXT: vmovaps (%rdi), %xmm0
-; ALL_AVX-NEXT: retq
- %r = load <4 x i32>, <4 x i32>* %p1, align 16
- ret <4 x i32> %r
-}
-
define i32 * @test_store_i32(i32 %val, i32 * %p1) {
; ALL-LABEL: test_store_i32:
; ALL: # BB#0:
@@ -139,19 +104,6 @@ define float * @test_store_float(float %val, float * %p1) {
; SSE_GREEDY-NEXT: movss %xmm0, (%rdi)
; SSE_GREEDY-NEXT: movq %rdi, %rax
; SSE_GREEDY-NEXT: retq
-;
-; ALL_AVX_FAST-LABEL: test_store_float:
-; ALL_AVX_FAST: # BB#0:
-; ALL_AVX_FAST-NEXT: vmovd %xmm0, %eax
-; ALL_AVX_FAST-NEXT: movl %eax, (%rdi)
-; ALL_AVX_FAST-NEXT: movq %rdi, %rax
-; ALL_AVX_FAST-NEXT: retq
-;
-; ALL_AVX_GREEDY-LABEL: test_store_float:
-; ALL_AVX_GREEDY: # BB#0:
-; ALL_AVX_GREEDY-NEXT: vmovss %xmm0, (%rdi)
-; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax
-; ALL_AVX_GREEDY-NEXT: retq
store float %val, float* %p1
ret float * %p1;
}
@@ -171,18 +123,6 @@ define double * @test_store_double(double %val, double * %p1) {
; SSE_GREEDY-NEXT: movq %rdi, %rax
; SSE_GREEDY-NEXT: retq
;
-; ALL_AVX_FAST-LABEL: test_store_double:
-; ALL_AVX_FAST: # BB#0:
-; ALL_AVX_FAST-NEXT: vmovq %xmm0, %rax
-; ALL_AVX_FAST-NEXT: movq %rax, (%rdi)
-; ALL_AVX_FAST-NEXT: movq %rdi, %rax
-; ALL_AVX_FAST-NEXT: retq
-;
-; ALL_AVX_GREEDY-LABEL: test_store_double:
-; ALL_AVX_GREEDY: # BB#0:
-; ALL_AVX_GREEDY-NEXT: vmovsd %xmm0, (%rdi)
-; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax
-; ALL_AVX_GREEDY-NEXT: retq
store double %val, double* %p1
ret double * %p1;
}
diff --git a/test/CodeGen/X86/GlobalISel/memop-vec.ll b/test/CodeGen/X86/GlobalISel/memop-vec.ll
new file mode 100644
index 000000000000..e218fded4d5f
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/memop-vec.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+
+define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) {
+; ALL-LABEL: test_load_v4i32_noalign:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups (%rdi), %xmm0
+; ALL-NEXT: retq
+ %r = load <4 x i32>, <4 x i32>* %p1, align 1
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) {
+; ALL-LABEL: test_load_v4i32_align:
+; ALL: # BB#0:
+; ALL-NEXT: vmovaps (%rdi), %xmm0
+; ALL-NEXT: retq
+ %r = load <4 x i32>, <4 x i32>* %p1, align 16
+ ret <4 x i32> %r
+}
+
+define void @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) {
+; ALL-LABEL: test_store_v4i32_noalign:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups %xmm0, (%rdi)
+; ALL-NEXT: retq
+ store <4 x i32> %val, <4 x i32>* %p1, align 1
+ ret void
+}
+
+define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) {
+; ALL-LABEL: test_store_v4i32_align:
+; ALL: # BB#0:
+; ALL-NEXT: vmovaps %xmm0, (%rdi)
+; ALL-NEXT: retq
+ store <4 x i32> %val, <4 x i32>* %p1, align 16
+ ret void
+}
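
The two RUN lines above exercise the same Skylake-X configuration with the fast and greedy register-bank selectors; both are expected to produce identical code, which is why a single shared ALL/SKX prefix covers them. As a minimal sketch of the pattern these vector memop tests follow (function and pointer names below are illustrative, not taken from the tree), an align 1 access is paired with a CHECK for the unaligned move:

; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel < %s -o - | FileCheck %s
define <4 x i32> @sketch_unaligned_load(<4 x i32>* %p) {
; CHECK-LABEL: sketch_unaligned_load:
; CHECK: vmovups (%rdi), %xmm0
  %v = load <4 x i32>, <4 x i32>* %p, align 1   ; align 1 forces the unaligned encoding
  ret <4 x i32> %v
}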
diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
index 3a65a9003773..1ea922ee475a 100644
--- a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
+++ b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
@@ -2,11 +2,6 @@
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -regbankselect-greedy -run-pass=regbankselect %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=GREEDY
--- |
- ; ModuleID = 'tmp.ll'
- source_filename = "tmp.ll"
- target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
- target triple = "x86_64--linux-gnu"
-
define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
%ret = add i8 %arg1, %arg2
ret i8 %ret
@@ -120,6 +115,26 @@
ret void
}
+ define i1 @test_icmp_eq_i8(i8 %a, i8 %b) {
+ %r = icmp eq i8 %a, %b
+ ret i1 %r
+ }
+
+ define i1 @test_icmp_eq_i16(i16 %a, i16 %b) {
+ %r = icmp eq i16 %a, %b
+ ret i1 %r
+ }
+
+ define i1 @test_icmp_eq_i32(i32 %a, i32 %b) {
+ %r = icmp eq i32 %a, %b
+ ret i1 %r
+ }
+
+ define i1 @test_icmp_eq_i64(i64 %a, i64 %b) {
+ %r = icmp eq i64 %a, %b
+ ret i1 %r
+ }
+
...
---
name: test_add_i8
@@ -735,3 +750,103 @@ body: |
RET 0
...
+---
+name: test_icmp_eq_i8
+# CHECK-LABEL: name: test_icmp_eq_i8
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: gpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s8) = COPY %edi
+ %1(s8) = COPY %esi
+ %2(s1) = G_ICMP intpred(eq), %0(s8), %1
+ %al = COPY %2(s1)
+ RET 0, implicit %al
+
+...
+---
+name: test_icmp_eq_i16
+# CHECK-LABEL: name: test_icmp_eq_i16
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: gpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s16) = COPY %edi
+ %1(s16) = COPY %esi
+ %2(s1) = G_ICMP intpred(eq), %0(s16), %1
+ %al = COPY %2(s1)
+ RET 0, implicit %al
+
+...
+---
+name: test_icmp_eq_i32
+# CHECK-LABEL: name: test_icmp_eq_i32
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: gpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(eq), %0(s32), %1
+ %al = COPY %2(s1)
+ RET 0, implicit %al
+
+...
+---
+name: test_icmp_eq_i64
+# CHECK-LABEL: name: test_icmp_eq_i64
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: gpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi, %rsi
+
+ %0(s64) = COPY %rdi
+ %1(s64) = COPY %rsi
+ %2(s1) = G_ICMP intpred(eq), %0(s64), %1
+ %al = COPY %2(s1)
+ RET 0, implicit %al
+
+...
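
Each of the four new cases feeds regbankselect generic virtual registers with no bank assigned (class: _) and checks that the pass places every G_ICMP operand, and its s1 result, on the gpr bank, for all scalar widths from s8 to s64. A hedged IR-level reduction of the shape one such case is generated from (the function name is illustrative):

define i1 @sketch_icmp(i32 %a, i32 %b) {
  ; after IR translation and legalization this becomes
  ;   %2(s1) = G_ICMP intpred(eq), %0(s32), %1
  ; and regbankselect assigns all three registers to gpr
  %c = icmp eq i32 %a, %b
  ret i1 %c
}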
diff --git a/test/CodeGen/X86/GlobalISel/select-br.mir b/test/CodeGen/X86/GlobalISel/select-br.mir
new file mode 100644
index 000000000000..6d8cd2b1367d
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-br.mir
@@ -0,0 +1,39 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+
+--- |
+ define void @uncondbr() {
+ entry:
+ br label %bb2
+
+ end: ; preds = %bb2
+ ret void
+
+ bb2: ; preds = %entry
+ br label %end
+ }
+
+...
+---
+name: uncondbr
+# CHECK-LABEL: name: uncondbr
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: JMP_1 %bb.2.bb2
+# CHECK: JMP_1 %bb.1.end
+body: |
+ bb.1.entry:
+ successors: %bb.3.bb2(0x80000000)
+
+ G_BR %bb.3.bb2
+
+ bb.2.end:
+ RET 0
+
+ bb.3.bb2:
+ successors: %bb.2.end(0x80000000)
+
+ G_BR %bb.2.end
+
+...
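
The test pins down only the two unconditional branches: instruction selection turns each G_BR into an x86 JMP_1, and the block layout is deliberately out of order so that neither branch can be folded into a fallthrough. A minimal IR sketch of the same control-flow shape (block and function names illustrative):

define void @sketch_uncondbr() {
entry:
  br label %bb2        ; forward jump: selected to JMP_1

done:                  ; placed between the blocks so neither branch is a fallthrough
  ret void

bb2:
  br label %done       ; backward jump: selected to JMP_1
}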
diff --git a/test/CodeGen/X86/GlobalISel/select-cmp.mir b/test/CodeGen/X86/GlobalISel/select-cmp.mir
new file mode 100644
index 000000000000..1d3da6cb88b9
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-cmp.mir
@@ -0,0 +1,563 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK
+
+--- |
+ define i32 @test_icmp_eq_i8(i8 %a, i8 %b) {
+ %r = icmp eq i8 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_eq_i16(i16 %a, i16 %b) {
+ %r = icmp eq i16 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_eq_i64(i64 %a, i64 %b) {
+ %r = icmp eq i64 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_eq_i32(i32 %a, i32 %b) {
+ %r = icmp eq i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_ne_i32(i32 %a, i32 %b) {
+ %r = icmp ne i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_ugt_i32(i32 %a, i32 %b) {
+ %r = icmp ugt i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_uge_i32(i32 %a, i32 %b) {
+ %r = icmp uge i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_ult_i32(i32 %a, i32 %b) {
+ %r = icmp ult i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_ule_i32(i32 %a, i32 %b) {
+ %r = icmp ule i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_sgt_i32(i32 %a, i32 %b) {
+ %r = icmp sgt i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_sge_i32(i32 %a, i32 %b) {
+ %r = icmp sge i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_slt_i32(i32 %a, i32 %b) {
+ %r = icmp slt i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+ define i32 @test_icmp_sle_i32(i32 %a, i32 %b) {
+ %r = icmp sle i32 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+ }
+
+...
+---
+name: test_icmp_eq_i8
+# CHECK-LABEL: name: test_icmp_eq_i8
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr8 }
+# CHECK-NEXT: - { id: 1, class: gr8 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %dil
+# CHECK-NEXT: %1 = COPY %sil
+# CHECK-NEXT: CMP8rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETEr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s8) = COPY %edi
+ %1(s8) = COPY %esi
+ %2(s1) = G_ICMP intpred(eq), %0(s8), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_eq_i16
+# CHECK-LABEL: name: test_icmp_eq_i16
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr16 }
+# CHECK-NEXT: - { id: 1, class: gr16 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %di
+# CHECK-NEXT: %1 = COPY %si
+# CHECK-NEXT: CMP16rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETEr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s16) = COPY %edi
+ %1(s16) = COPY %esi
+ %2(s1) = G_ICMP intpred(eq), %0(s16), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_eq_i64
+# CHECK-LABEL: name: test_icmp_eq_i64
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr64 }
+# CHECK-NEXT: - { id: 1, class: gr64 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %rdi
+# CHECK-NEXT: %1 = COPY %rsi
+# CHECK-NEXT: CMP64rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETEr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi, %rsi
+
+ %0(s64) = COPY %rdi
+ %1(s64) = COPY %rsi
+ %2(s1) = G_ICMP intpred(eq), %0(s64), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_eq_i32
+# CHECK-LABEL: name: test_icmp_eq_i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %esi
+# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETEr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(eq), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_ne_i32
+# CHECK-LABEL: name: test_icmp_ne_i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %esi
+# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETNEr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(ne), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_ugt_i32
+# CHECK-LABEL: name: test_icmp_ugt_i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %esi
+# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETAr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(ugt), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_uge_i32
+# CHECK-LABEL: name: test_icmp_uge_i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %esi
+# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETAEr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(uge), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_ult_i32
+# CHECK-LABEL: name: test_icmp_ult_i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %esi
+# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETBr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(ult), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_ule_i32
+# CHECK-LABEL: name: test_icmp_ule_i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %esi
+# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETBEr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(ule), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_sgt_i32
+# CHECK-LABEL: name: test_icmp_sgt_i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %esi
+# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETGr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(sgt), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_sge_i32
+# CHECK-LABEL: name: test_icmp_sge_i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %esi
+# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETGEr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(sge), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_slt_i32
+# CHECK-LABEL: name: test_icmp_slt_i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %esi
+# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETLr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(slt), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_icmp_sle_i32
+# CHECK-LABEL: name: test_icmp_sle_i32
+alignment: 4
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gr32 }
+# CHECK-NEXT: - { id: 1, class: gr32 }
+# CHECK-NEXT: - { id: 2, class: gr8 }
+# CHECK-NEXT: - { id: 3, class: gr32 }
+# CHECK-NEXT: - { id: 4, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# CHECK: %0 = COPY %edi
+# CHECK-NEXT: %1 = COPY %esi
+# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
+# CHECK-NEXT: %2 = SETLEr implicit %eflags
+# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
+# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s1) = G_ICMP intpred(sle), %0(s32), %1
+ %3(s32) = G_ZEXT %2(s1)
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
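
All thirteen cases select the same three-instruction skeleton, varying only the condition code: a CMPrr of the two operands, a SETcc that materializes the predicate into an 8-bit register, then a zext performed as SUBREG_TO_REG into a 32-bit register followed by AND32ri8 with 1, which clears everything above bit 0 before the value is used as an i32. A hedged IR reduction of one case, with comments mapping each operation to the instructions the CHECK lines expect:

define i32 @sketch_cmp_zext(i32 %a, i32 %b) {
  %c = icmp ult i32 %a, %b   ; CMP32rr %0, %1 then SETBr (below = unsigned less-than)
  %z = zext i1 %c to i32     ; SUBREG_TO_REG 0, %2, 1 then AND32ri8 %4, 1
  ret i32 %z                 ; %eax = COPY %3 ; RET 0, implicit %eax
}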
diff --git a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
index 85b3f61a9e44..0844701487bc 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
@@ -1,6 +1,12 @@
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
+ define i64 @test_zext_i1(i8 %a) {
+ %val = trunc i8 %a to i1
+ %r = zext i1 %val to i64
+ ret i64 %r
+ }
+
define i64 @test_sext_i8(i8 %val) {
%r = sext i8 %val to i64
ret i64 %r
@@ -13,6 +19,38 @@
...
---
+name: test_zext_i1
+# ALL-LABEL: name: test_zext_i1
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr8 }
+# ALL-NEXT: - { id: 1, class: gr8 }
+# ALL-NEXT: - { id: 2, class: gr64 }
+# ALL-NEXT: - { id: 3, class: gr64 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+# ALL: %0 = COPY %dil
+# ALL-NEXT: %1 = COPY %0
+# ALL-NEXT: %3 = SUBREG_TO_REG 0, %1, 1
+# ALL-NEXT: %2 = AND64ri8 %3, 1, implicit-def %eflags
+# ALL-NEXT: %rax = COPY %2
+# ALL-NEXT: RET 0, implicit %rax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s8) = COPY %edi
+ %1(s1) = G_TRUNC %0(s8)
+ %2(s64) = G_ZEXT %1(s1)
+ %rax = COPY %2(s64)
+ RET 0, implicit %rax
+
+...
+---
name: test_sext_i8
# ALL-LABEL: name: test_sext_i8
alignment: 4
diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir
index 63aeae89bd1a..831d6efb75f1 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext.mir
@@ -2,6 +2,11 @@
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
+ define i32 @test_zext_i1(i1 %a) {
+ %r = zext i1 %a to i32
+ ret i32 %r
+ }
+
define i32 @test_zext_i8(i8 %val) {
%r = zext i8 %val to i32
ret i32 %r
@@ -24,6 +29,34 @@
...
---
+name: test_zext_i1
+# ALL-LABEL: name: test_zext_i1
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr8 }
+# ALL-NEXT: - { id: 1, class: gr32 }
+# ALL-NEXT: - { id: 2, class: gr32 }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+# ALL: %0 = COPY %dil
+# ALL-NEXT: %2 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %1 = AND32ri8 %2, 1, implicit-def %eflags
+# ALL-NEXT: %eax = COPY %1
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s1) = COPY %edi
+ %1(s32) = G_ZEXT %0(s1)
+ %eax = COPY %1(s32)
+ RET 0, implicit %eax
+
+...
+---
name: test_zext_i8
# ALL-LABEL: name: test_zext_i8
alignment: 4
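
The new test_zext_i1 differs from the compare tests in that the i1 arrives in a register rather than from a SETcc, so only bit 0 of the incoming byte is defined; the AND with 1 emitted by selection is what makes the zext result well defined. A hedged sketch (name illustrative):

define i32 @sketch_zext_i1(i1 %b) {
  ; %dil may carry garbage in bits 1..7; selection widens with
  ; SUBREG_TO_REG and then masks it off: AND32ri8 %2, 1
  %r = zext i1 %b to i32
  ret i32 %r
}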
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-x32.mir b/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir
index 8e6a2771db6e..8e6a2771db6e 100644
--- a/test/CodeGen/X86/GlobalISel/select-memop-x32.mir
+++ b/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir
diff --git a/test/CodeGen/X86/GlobalISel/select-memop.mir b/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir
index 817dc3cc9764..b57c9b0cca98 100644
--- a/test/CodeGen/X86/GlobalISel/select-memop.mir
+++ b/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir
@@ -34,7 +34,6 @@
ret float %r
}
-
define double @test_load_double(double* %p1) {
%r = load double, double* %p1
ret double %r
@@ -45,16 +44,6 @@
ret double %r
}
- define <4 x i32> @test_load_v4i32_noalign(<4 x i32>* %p1) {
- %r = load <4 x i32>, <4 x i32>* %p1, align 1
- ret <4 x i32> %r
- }
-
- define <4 x i32> @test_load_v4i32_align(<4 x i32>* %p1) {
- %r = load <4 x i32>, <4 x i32>* %p1, align 16
- ret <4 x i32> %r
- }
-
define i32* @test_store_i32(i32 %val, i32* %p1) {
store i32 %val, i32* %p1
ret i32* %p1
@@ -85,16 +74,6 @@
ret double* %p1
}
- define <4 x i32>* @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) {
- store <4 x i32> %val, <4 x i32>* %p1, align 16
- ret <4 x i32>* %p1
- }
-
- define <4 x i32>* @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) {
- store <4 x i32> %val, <4 x i32>* %p1, align 1
- ret <4 x i32>* %p1
- }
-
define i32* @test_load_ptr(i32** %ptr1) {
%p = load i32*, i32** %ptr1
ret i32* %p
@@ -304,62 +283,6 @@ body: |
...
---
-# ALL-LABEL: name: test_load_v4i32_noalign
-name: test_load_v4i32_noalign
-alignment: 4
-legalized: true
-regBankSelected: true
-registers:
-# ALL: - { id: 0, class: gr64 }
-# NO_AVX512F: - { id: 1, class: vr128 }
-# AVX512ALL: - { id: 1, class: vr128x }
- - { id: 0, class: gpr }
- - { id: 1, class: vecr }
-# ALL: %0 = COPY %rdi
-# SSE: %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
-# AVX: %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
-# AVX512F: %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
-# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
-# ALL: %xmm0 = COPY %1
-body: |
- bb.1 (%ir-block.0):
- liveins: %rdi
-
- %0(p0) = COPY %rdi
- %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1)
- %xmm0 = COPY %1(<4 x s32>)
- RET 0, implicit %xmm0
-
-...
----
-# ALL-LABEL: name: test_load_v4i32_align
-name: test_load_v4i32_align
-alignment: 4
-legalized: true
-regBankSelected: true
-registers:
-# ALL: - { id: 0, class: gr64 }
-# NO_AVX512F: - { id: 1, class: vr128 }
-# AVX512ALL: - { id: 1, class: vr128x }
- - { id: 0, class: gpr }
- - { id: 1, class: vecr }
-# ALL: %0 = COPY %rdi
-# SSE: %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
-# AVX: %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
-# AVX512F: %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1)
-# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
-# ALL: %xmm0 = COPY %1
-body: |
- bb.1 (%ir-block.0):
- liveins: %rdi
-
- %0(p0) = COPY %rdi
- %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1)
- %xmm0 = COPY %1(<4 x s32>)
- RET 0, implicit %xmm0
-
-...
----
# ALL-LABEL: name: test_store_i32
name: test_store_i32
alignment: 4
@@ -530,66 +453,6 @@ body: |
...
---
-# ALL-LABEL: name: test_store_v4i32_align
-name: test_store_v4i32_align
-alignment: 4
-legalized: true
-regBankSelected: true
-registers:
-# NO_AVX512F: - { id: 0, class: vr128 }
-# AVX512ALL: - { id: 0, class: vr128x }
-# ALL: - { id: 1, class: gr64 }
- - { id: 0, class: vecr }
- - { id: 1, class: gpr }
-# ALL: %0 = COPY %xmm0
-# ALL: %1 = COPY %rdi
-# SSE: MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
-# AVX: VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
-# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
-# AVX512VL: VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
-# ALL: %rax = COPY %1
-body: |
- bb.1 (%ir-block.0):
- liveins: %rdi, %xmm0
-
- %0(<4 x s32>) = COPY %xmm0
- %1(p0) = COPY %rdi
- G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 16)
- %rax = COPY %1(p0)
- RET 0, implicit %rax
-
-...
----
-# ALL-LABEL: name: test_store_v4i32_noalign
-name: test_store_v4i32_noalign
-alignment: 4
-legalized: true
-regBankSelected: true
-registers:
-# NO_AVX512F: - { id: 0, class: vr128 }
-# AVX512ALL: - { id: 0, class: vr128x }
-# ALL: - { id: 1, class: gr64 }
- - { id: 0, class: vecr }
- - { id: 1, class: gpr }
-# ALL: %0 = COPY %xmm0
-# ALL: %1 = COPY %rdi
-# SSE: MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
-# AVX: VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
-# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
-# AVX512VL: VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
-# ALL: %rax = COPY %1
-body: |
- bb.1 (%ir-block.0):
- liveins: %rdi, %xmm0
-
- %0(<4 x s32>) = COPY %xmm0
- %1(p0) = COPY %rdi
- G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 1)
- %rax = COPY %1(p0)
- RET 0, implicit %rax
-
-...
----
# ALL-LABEL: name: test_load_ptr
name: test_load_ptr
alignment: 4
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v128.mir b/test/CodeGen/X86/GlobalISel/select-memop-v128.mir
new file mode 100644
index 000000000000..ce3f6b91dcf6
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-memop-v128.mir
@@ -0,0 +1,143 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL
+
+--- |
+ define <4 x i32> @test_load_v4i32_noalign(<4 x i32>* %p1) {
+ %r = load <4 x i32>, <4 x i32>* %p1, align 1
+ ret <4 x i32> %r
+ }
+
+ define <4 x i32> @test_load_v4i32_align(<4 x i32>* %p1) {
+ %r = load <4 x i32>, <4 x i32>* %p1, align 16
+ ret <4 x i32> %r
+ }
+
+ define <4 x i32>* @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) {
+ store <4 x i32> %val, <4 x i32>* %p1, align 16
+ ret <4 x i32>* %p1
+ }
+
+ define <4 x i32>* @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) {
+ store <4 x i32> %val, <4 x i32>* %p1, align 1
+ ret <4 x i32>* %p1
+ }
+
+...
+---
+# ALL-LABEL: name: test_load_v4i32_noalign
+name: test_load_v4i32_noalign
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+# ALL: - { id: 0, class: gr64 }
+# NO_AVX512F: - { id: 1, class: vr128 }
+# AVX512ALL: - { id: 1, class: vr128x }
+ - { id: 0, class: gpr }
+ - { id: 1, class: vecr }
+# ALL: %0 = COPY %rdi
+# SSE: %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
+# AVX: %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
+# AVX512F: %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
+# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
+# ALL: %xmm0 = COPY %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi
+
+ %0(p0) = COPY %rdi
+ %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1)
+ %xmm0 = COPY %1(<4 x s32>)
+ RET 0, implicit %xmm0
+
+...
+---
+# ALL-LABEL: name: test_load_v4i32_align
+name: test_load_v4i32_align
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+# ALL: - { id: 0, class: gr64 }
+# NO_AVX512F: - { id: 1, class: vr128 }
+# AVX512ALL: - { id: 1, class: vr128x }
+ - { id: 0, class: gpr }
+ - { id: 1, class: vecr }
+# ALL: %0 = COPY %rdi
+# SSE: %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
+# AVX: %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
+# AVX512F: %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1)
+# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
+# ALL: %xmm0 = COPY %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi
+
+ %0(p0) = COPY %rdi
+ %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1)
+ %xmm0 = COPY %1(<4 x s32>)
+ RET 0, implicit %xmm0
+
+...
+---
+# ALL-LABEL: name: test_store_v4i32_align
+name: test_store_v4i32_align
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+# NO_AVX512F: - { id: 0, class: vr128 }
+# AVX512ALL: - { id: 0, class: vr128x }
+# ALL: - { id: 1, class: gr64 }
+ - { id: 0, class: vecr }
+ - { id: 1, class: gpr }
+# ALL: %0 = COPY %xmm0
+# ALL: %1 = COPY %rdi
+# SSE: MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
+# AVX: VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
+# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
+# AVX512VL: VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
+# ALL: %rax = COPY %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi, %xmm0
+
+ %0(<4 x s32>) = COPY %xmm0
+ %1(p0) = COPY %rdi
+ G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 16)
+ %rax = COPY %1(p0)
+ RET 0, implicit %rax
+
+...
+---
+# ALL-LABEL: name: test_store_v4i32_noalign
+name: test_store_v4i32_noalign
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+# NO_AVX512F: - { id: 0, class: vr128 }
+# AVX512ALL: - { id: 0, class: vr128x }
+# ALL: - { id: 1, class: gr64 }
+ - { id: 0, class: vecr }
+ - { id: 1, class: gpr }
+# ALL: %0 = COPY %xmm0
+# ALL: %1 = COPY %rdi
+# SSE: MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
+# AVX: VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
+# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
+# AVX512VL: VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
+# ALL: %rax = COPY %1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %rdi, %xmm0
+
+ %0(<4 x s32>) = COPY %xmm0
+ %1(p0) = COPY %rdi
+ G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 1)
+ %rax = COPY %1(p0)
+ RET 0, implicit %rax
+
+...
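
The four cases split on alignment: an align 16 access selects the MOVAPS family, whose aligned forms fault on a misaligned address, while align 1 falls back to the MOVUPS family, which makes no alignment assumption. The Z128 variants are the AVX-512 encodings, with the _NOVLX pseudo used when AVX512F is present but AVX512VL is not. A hedged sketch pairing both forms (names illustrative):

define void @sketch_vec_copy(<4 x i32>* %src, <4 x i32>* %dst) {
  %v = load <4 x i32>, <4 x i32>* %src, align 1     ; MOVUPS/VMOVUPS: no alignment assumed
  store <4 x i32> %v, <4 x i32>* %dst, align 16     ; MOVAPS/VMOVAPS: needs 16-byte alignment
  ret void
}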
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll
new file mode 100644
index 000000000000..262cb96ca6d8
--- /dev/null
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+; CHECK-LABEL: Pass Arguments:
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Target Pass Configuration
+; CHECK-NEXT: Type-Based Alias Analysis
+; CHECK-NEXT: Scoped NoAlias Alias Analysis
+; CHECK-NEXT: Assumption Cache Tracker
+; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Machine Module Information
+; CHECK-NEXT: Machine Branch Probability Analysis
+; CHECK-NEXT: ModulePass Manager
+; CHECK-NEXT: Pre-ISel Intrinsic Lowering
+; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand Atomic instructions
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT: Module Verifier
+; CHECK-NEXT: Lower Garbage Collection Instructions
+; CHECK-NEXT: Shadow Stack GC Lowering
+; CHECK-NEXT: Remove unreachable blocks from the CFG
+; CHECK-NEXT: Inserts calls to mcount-like functions
+; CHECK-NEXT: Scalarize Masked Memory Intrinsics
+; CHECK-NEXT: Expand reduction intrinsics
+; CHECK-NEXT: Rewrite Symbols
+; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Exception handling preparation
+; CHECK-NEXT: Safe Stack instrumentation pass
+; CHECK-NEXT: Insert stack protectors
+; CHECK-NEXT: Module Verifier
+; CHECK-NEXT: X86 DAG->DAG Instruction Selection
+; CHECK-NEXT: X86 PIC Global Base Reg Initialization
+; CHECK-NEXT: Expand ISel Pseudo-instructions
+; CHECK-NEXT: Local Stack Slot Allocation
+; CHECK-NEXT: X86 WinAlloca Expander
+; CHECK-NEXT: Eliminate PHI nodes for register allocation
+; CHECK-NEXT: Two-Address instruction pass
+; CHECK-NEXT: Fast Register Allocator
+; CHECK-NEXT: Bundle Machine CFG Edges
+; CHECK-NEXT: X86 FP Stackifier
+; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
+; CHECK-NEXT: Post-RA pseudo instruction expansion pass
+; CHECK-NEXT: X86 pseudo instruction expansion pass
+; CHECK-NEXT: Analyze Machine Code For Garbage Collection
+; CHECK-NEXT: X86 vzeroupper inserter
+; CHECK-NEXT: Contiguously Lay Out Funclets
+; CHECK-NEXT: StackMap Liveness Analysis
+; CHECK-NEXT: Live DEBUG_VALUE analysis
+; CHECK-NEXT: Insert fentry calls
+; CHECK-NEXT: MachineDominator Tree Construction
+; CHECK-NEXT: Machine Natural Loop Construction
+; CHECK-NEXT: Insert XRay ops
+; CHECK-NEXT: Implement the 'patchable-function' attribute
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
+; CHECK-NEXT: Machine Optimization Remark Emitter
+; CHECK-NEXT: MachineDominator Tree Construction
+; CHECK-NEXT: Machine Natural Loop Construction
+; CHECK-NEXT: X86 Assembly Printer
+; CHECK-NEXT: Free MachineFunction
+
+define void @f() {
+ ret void
+}
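
-debug-pass=Structure prints the legacy pass manager's pipeline, so this test pins the exact O0 pass order for x86; the option is only usable in asserts-enabled builds, which is presumably why the test carries REQUIRES: asserts. A trimmed, hedged variant that checks a single pipeline property instead of the full CHECK-NEXT chain (less brittle, but also far less precise):

; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
; REQUIRES: asserts
; CHECK: Fast Register Allocator
define void @g() {
  ret void
}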
diff --git a/test/CodeGen/X86/all-ones-vector.ll b/test/CodeGen/X86/all-ones-vector.ll
index 35f488ea448c..d0160a5b84df 100644
--- a/test/CodeGen/X86/all-ones-vector.ll
+++ b/test/CodeGen/X86/all-ones-vector.ll
@@ -157,8 +157,8 @@ define <32 x i8> @allones_v32i8() nounwind {
;
; X32-AVX1-LABEL: allones_v32i8:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v32i8:
@@ -174,8 +174,8 @@ define <32 x i8> @allones_v32i8() nounwind {
;
; X64-AVX1-LABEL: allones_v32i8:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v32i8:
@@ -194,8 +194,8 @@ define <16 x i16> @allones_v16i16() nounwind {
;
; X32-AVX1-LABEL: allones_v16i16:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v16i16:
@@ -211,8 +211,8 @@ define <16 x i16> @allones_v16i16() nounwind {
;
; X64-AVX1-LABEL: allones_v16i16:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v16i16:
@@ -231,8 +231,8 @@ define <8 x i32> @allones_v8i32() nounwind {
;
; X32-AVX1-LABEL: allones_v8i32:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v8i32:
@@ -248,8 +248,8 @@ define <8 x i32> @allones_v8i32() nounwind {
;
; X64-AVX1-LABEL: allones_v8i32:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v8i32:
@@ -268,8 +268,8 @@ define <4 x i64> @allones_v4i64() nounwind {
;
; X32-AVX1-LABEL: allones_v4i64:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v4i64:
@@ -285,8 +285,8 @@ define <4 x i64> @allones_v4i64() nounwind {
;
; X64-AVX1-LABEL: allones_v4i64:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v4i64:
@@ -305,8 +305,8 @@ define <4 x double> @allones_v4f64() nounwind {
;
; X32-AVX1-LABEL: allones_v4f64:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v4f64:
@@ -322,8 +322,8 @@ define <4 x double> @allones_v4f64() nounwind {
;
; X64-AVX1-LABEL: allones_v4f64:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v4f64:
@@ -342,8 +342,8 @@ define <4 x double> @allones_v4f64_optsize() nounwind optsize {
;
; X32-AVX1-LABEL: allones_v4f64_optsize:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v4f64_optsize:
@@ -359,8 +359,8 @@ define <4 x double> @allones_v4f64_optsize() nounwind optsize {
;
; X64-AVX1-LABEL: allones_v4f64_optsize:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v4f64_optsize:
@@ -379,8 +379,8 @@ define <8 x float> @allones_v8f32() nounwind {
;
; X32-AVX1-LABEL: allones_v8f32:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v8f32:
@@ -396,8 +396,8 @@ define <8 x float> @allones_v8f32() nounwind {
;
; X64-AVX1-LABEL: allones_v8f32:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v8f32:
@@ -416,8 +416,8 @@ define <8 x float> @allones_v8f32_optsize() nounwind optsize {
;
; X32-AVX1-LABEL: allones_v8f32_optsize:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v8f32_optsize:
@@ -433,8 +433,8 @@ define <8 x float> @allones_v8f32_optsize() nounwind optsize {
;
; X64-AVX1-LABEL: allones_v8f32_optsize:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v8f32_optsize:
@@ -455,8 +455,8 @@ define <64 x i8> @allones_v64i8() nounwind {
;
; X32-AVX1-LABEL: allones_v64i8:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
@@ -487,8 +487,8 @@ define <64 x i8> @allones_v64i8() nounwind {
;
; X64-AVX1-LABEL: allones_v64i8:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
@@ -522,8 +522,8 @@ define <32 x i16> @allones_v32i16() nounwind {
;
; X32-AVX1-LABEL: allones_v32i16:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
@@ -554,8 +554,8 @@ define <32 x i16> @allones_v32i16() nounwind {
;
; X64-AVX1-LABEL: allones_v32i16:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
@@ -589,8 +589,8 @@ define <16 x i32> @allones_v16i32() nounwind {
;
; X32-AVX1-LABEL: allones_v16i32:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
@@ -615,8 +615,8 @@ define <16 x i32> @allones_v16i32() nounwind {
;
; X64-AVX1-LABEL: allones_v16i32:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
@@ -644,8 +644,8 @@ define <8 x i64> @allones_v8i64() nounwind {
;
; X32-AVX1-LABEL: allones_v8i64:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
@@ -670,8 +670,8 @@ define <8 x i64> @allones_v8i64() nounwind {
;
; X64-AVX1-LABEL: allones_v8i64:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
@@ -699,8 +699,8 @@ define <8 x double> @allones_v8f64() nounwind {
;
; X32-AVX1-LABEL: allones_v8f64:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
@@ -725,8 +725,8 @@ define <8 x double> @allones_v8f64() nounwind {
;
; X64-AVX1-LABEL: allones_v8f64:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
@@ -754,8 +754,8 @@ define <16 x float> @allones_v16f32() nounwind {
;
; X32-AVX1-LABEL: allones_v16f32:
; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
@@ -780,8 +780,8 @@ define <16 x float> @allones_v16f32() nounwind {
;
; X64-AVX1-LABEL: allones_v16f32:
; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
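
Every AVX1 case changes the same way: instead of building all-ones in an xmm with vpcmpeqd and widening it with vinsertf128, the constant is now materialized directly at 256-bit width by zeroing a ymm and comparing it against itself with the always-true predicate, since vcmptrueps sets every lane to all-ones regardless of its inputs. This also keeps the sequence in the floating-point domain, which matters because AVX1 has no 256-bit integer compare. A hedged IR sketch of a function that triggers the pattern (name illustrative):

define <8 x i32> @sketch_allones() {
  ; on AVX1 this constant now lowers to:
  ;   vxorps     %ymm0, %ymm0, %ymm0
  ;   vcmptrueps %ymm0, %ymm0, %ymm0
  ret <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
}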
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index 2aaf14001758..aa28ef5175ed 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -135,88 +135,87 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; SSE2-LABEL: avg_v32i8:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm8
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm3
+; SSE2-NEXT: movdqa 16(%rdi), %xmm8
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm8, %xmm12
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm12
; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm8, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm15
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm15, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm6, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm12, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm11, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm7, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm10, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: paddd %xmm9, %xmm13
-; SSE2-NEXT: paddd %xmm15, %xmm2
-; SSE2-NEXT: paddd %xmm14, %xmm5
-; SSE2-NEXT: paddd %xmm8, %xmm0
-; SSE2-NEXT: paddd %xmm12, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm3
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; SSE2-NEXT: paddd %xmm4, %xmm7
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm9
; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm13
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: paddd %xmm4, %xmm7
; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm7, %xmm3
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm3
; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm6, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm4, %xmm9
; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: pand %xmm4, %xmm13
+; SSE2-NEXT: packuswb %xmm9, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm5, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm13, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm7, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
@@ -259,198 +258,183 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; SSE2-LABEL: avg_v64i8:
; SSE2: # BB#0:
-; SSE2-NEXT: subq $152, %rsp
-; SSE2-NEXT: .Lcfi0:
-; SSE2-NEXT: .cfi_def_cfa_offset 160
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa 16(%rdi), %xmm4
-; SSE2-NEXT: movdqa 32(%rdi), %xmm5
-; SSE2-NEXT: movdqa 48(%rdi), %xmm6
+; SSE2-NEXT: movdqa (%rdi), %xmm6
+; SSE2-NEXT: movdqa 16(%rdi), %xmm2
+; SSE2-NEXT: movdqa 32(%rdi), %xmm1
+; SSE2-NEXT: movdqa 48(%rdi), %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa (%rsi), %xmm5
+; SSE2-NEXT: movdqa 16(%rsi), %xmm13
+; SSE2-NEXT: movdqa 32(%rsi), %xmm11
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm2, %xmm15
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm15, %xmm14
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm10, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm7, %xmm3
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm4, %xmm10
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm12, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm6, %xmm5
; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm6, %xmm8
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa (%rsi), %xmm14
-; SSE2-NEXT: movdqa %xmm14, %xmm7
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm13, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm4, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm14, %xmm12
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm14, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; SSE2-NEXT: movdqa 16(%rsi), %xmm12
-; SSE2-NEXT: movdqa %xmm12, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm15, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm13, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm8, %xmm15
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm2, %xmm13
+; SSE2-NEXT: movdqa %xmm11, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm5, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm12, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
-; SSE2-NEXT: movdqa 32(%rsi), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm7, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm11, %xmm14
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm2, %xmm14
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm1, %xmm11
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movdqa 48(%rsi), %xmm7
+; SSE2-NEXT: movdqa %xmm7, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm1, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm5, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm1, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movdqa 48(%rsi), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: paddd %xmm8, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
-; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm2, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: paddd %xmm0, %xmm10
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm0, %xmm12
+; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm15
-; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: paddd %xmm0, %xmm14
; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: paddd %xmm0, %xmm9
; SSE2-NEXT: paddd %xmm0, %xmm6
-; SSE2-NEXT: paddd %xmm0, %xmm10
-; SSE2-NEXT: paddd %xmm0, %xmm12
+; SSE2-NEXT: paddd %xmm0, %xmm14
; SSE2-NEXT: paddd %xmm0, %xmm11
-; SSE2-NEXT: paddd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: paddd %xmm0, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: psrld $1, %xmm15
+; SSE2-NEXT: paddd %xmm0, %xmm5
+; SSE2-NEXT: paddd %xmm0, %xmm7
+; SSE2-NEXT: psrld $1, %xmm10
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm10
+; SSE2-NEXT: packuswb %xmm1, %xmm10
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: packuswb %xmm1, %xmm2
+; SSE2-NEXT: packuswb %xmm10, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: psrld $1, %xmm12
+; SSE2-NEXT: pand %xmm0, %xmm12
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: packuswb %xmm12, %xmm4
+; SSE2-NEXT: psrld $1, %xmm13
+; SSE2-NEXT: psrld $1, %xmm15
; SSE2-NEXT: pand %xmm0, %xmm15
-; SSE2-NEXT: pand %xmm0, %xmm7
-; SSE2-NEXT: packuswb %xmm15, %xmm7
-; SSE2-NEXT: psrld $1, %xmm14
+; SSE2-NEXT: pand %xmm0, %xmm13
+; SSE2-NEXT: packuswb %xmm15, %xmm13
+; SSE2-NEXT: packuswb %xmm4, %xmm13
+; SSE2-NEXT: psrld $1, %xmm6
; SSE2-NEXT: psrld $1, %xmm9
; SSE2-NEXT: pand %xmm0, %xmm9
-; SSE2-NEXT: pand %xmm0, %xmm14
-; SSE2-NEXT: packuswb %xmm9, %xmm14
-; SSE2-NEXT: packuswb %xmm7, %xmm14
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: pand %xmm0, %xmm13
; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: packuswb %xmm13, %xmm6
-; SSE2-NEXT: psrld $1, %xmm12
-; SSE2-NEXT: psrld $1, %xmm10
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: pand %xmm0, %xmm12
-; SSE2-NEXT: packuswb %xmm10, %xmm12
-; SSE2-NEXT: packuswb %xmm6, %xmm12
-; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: packuswb %xmm9, %xmm6
; SSE2-NEXT: psrld $1, %xmm11
+; SSE2-NEXT: psrld $1, %xmm14
+; SSE2-NEXT: pand %xmm0, %xmm14
; SSE2-NEXT: pand %xmm0, %xmm11
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: packuswb %xmm11, %xmm5
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm6, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm5
-; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: packuswb %xmm5, %xmm4
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: packuswb %xmm14, %xmm11
+; SSE2-NEXT: packuswb %xmm6, %xmm11
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: packuswb %xmm8, %xmm3
+; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm5, %xmm1
-; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: packuswb %xmm5, %xmm7
+; SSE2-NEXT: packuswb %xmm3, %xmm7
+; SSE2-NEXT: movdqu %xmm7, (%rax)
+; SSE2-NEXT: movdqu %xmm11, (%rax)
+; SSE2-NEXT: movdqu %xmm13, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm2, (%rax)
-; SSE2-NEXT: movdqu %xmm12, (%rax)
-; SSE2-NEXT: movdqu %xmm14, (%rax)
-; SSE2-NEXT: addq $152, %rsp
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v64i8:
@@ -464,21 +448,21 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7
-; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6
-; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5
-; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8
; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9
; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10
@@ -540,13 +524,13 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3
-; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2
-; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4
; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
@@ -673,27 +657,27 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; SSE2-LABEL: avg_v16i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm5
+; SSE2-NEXT: movdqa (%rdi), %xmm2
+; SSE2-NEXT: movdqa 16(%rdi), %xmm4
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm0
@@ -755,80 +739,79 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-LABEL: avg_v32i16:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm10
-; SSE2-NEXT: movdqa 16(%rdi), %xmm9
-; SSE2-NEXT: movdqa 32(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm4
+; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa 32(%rdi), %xmm10
; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm14
+; SSE2-NEXT: movdqa (%rsi), %xmm9
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm11, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm8, %xmm13
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm14, %xmm7
+; SSE2-NEXT: movdqa %xmm9, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm4, %xmm9
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm11, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm12, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm13, %xmm4
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: paddd %xmm11, %xmm2
-; SSE2-NEXT: paddd %xmm15, %xmm5
-; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm12, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm14
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm14
+; SSE2-NEXT: paddd %xmm0, %xmm9
; SSE2-NEXT: paddd %xmm0, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm5
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm14
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: psrld $1, %xmm9
; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: pslld $16, %xmm7
; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm14
-; SSE2-NEXT: psrad $16, %xmm14
-; SSE2-NEXT: packssdw %xmm7, %xmm14
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: pslld $16, %xmm9
+; SSE2-NEXT: psrad $16, %xmm9
+; SSE2-NEXT: packssdw %xmm7, %xmm9
; SSE2-NEXT: pslld $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: packssdw %xmm6, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
; SSE2-NEXT: pslld $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: pslld $16, %xmm3
@@ -837,7 +820,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm14, (%rax)
+; SSE2-NEXT: movdqu %xmm9, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v32i16:
@@ -847,13 +830,13 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
@@ -884,9 +867,9 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
@@ -1047,88 +1030,87 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; SSE2-LABEL: avg_v32i8_2:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm8
-; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm3
+; SSE2-NEXT: movdqa 16(%rdi), %xmm8
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm8, %xmm12
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm12
; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm8, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm15
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm15, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm6, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm12, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm11, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm7, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT: paddd %xmm10, %xmm7
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: paddd %xmm9, %xmm13
-; SSE2-NEXT: paddd %xmm15, %xmm2
-; SSE2-NEXT: paddd %xmm14, %xmm5
-; SSE2-NEXT: paddd %xmm8, %xmm0
-; SSE2-NEXT: paddd %xmm12, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm3
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; SSE2-NEXT: paddd %xmm4, %xmm7
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm9
; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm13
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: paddd %xmm4, %xmm7
; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: psrld $1, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: packuswb %xmm7, %xmm3
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm3
; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm6, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm4, %xmm9
; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm13
-; SSE2-NEXT: pand %xmm4, %xmm13
+; SSE2-NEXT: packuswb %xmm9, %xmm2
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm5, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm13, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm7, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
@@ -1512,27 +1494,27 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; SSE2-LABEL: avg_v16i16_2:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm4
-; SSE2-NEXT: movdqa 16(%rdi), %xmm5
+; SSE2-NEXT: movdqa (%rdi), %xmm2
+; SSE2-NEXT: movdqa 16(%rdi), %xmm4
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: paddd %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm8, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm0
@@ -1594,80 +1576,79 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-LABEL: avg_v32i16_2:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm10
-; SSE2-NEXT: movdqa 16(%rdi), %xmm9
-; SSE2-NEXT: movdqa 32(%rdi), %xmm11
+; SSE2-NEXT: movdqa (%rdi), %xmm4
+; SSE2-NEXT: movdqa 16(%rdi), %xmm11
+; SSE2-NEXT: movdqa 32(%rdi), %xmm10
; SSE2-NEXT: movdqa 48(%rdi), %xmm8
-; SSE2-NEXT: movdqa (%rsi), %xmm14
+; SSE2-NEXT: movdqa (%rsi), %xmm9
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm9, %xmm12
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm11, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm11, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm8, %xmm13
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm14, %xmm7
+; SSE2-NEXT: movdqa %xmm9, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm4, %xmm9
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm11, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm12, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: paddd %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: paddd %xmm13, %xmm4
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: paddd %xmm11, %xmm2
-; SSE2-NEXT: paddd %xmm15, %xmm5
-; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm12, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm14
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm14
+; SSE2-NEXT: paddd %xmm0, %xmm9
; SSE2-NEXT: paddd %xmm0, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm5
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: psrld $1, %xmm14
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: psrld $1, %xmm5
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: psrld $1, %xmm9
; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: pslld $16, %xmm7
; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: pslld $16, %xmm14
-; SSE2-NEXT: psrad $16, %xmm14
-; SSE2-NEXT: packssdw %xmm7, %xmm14
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm6
+; SSE2-NEXT: pslld $16, %xmm9
+; SSE2-NEXT: psrad $16, %xmm9
+; SSE2-NEXT: packssdw %xmm7, %xmm9
; SSE2-NEXT: pslld $16, %xmm6
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: packssdw %xmm6, %xmm1
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm5
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: psrld $1, %xmm3
-; SSE2-NEXT: psrld $1, %xmm4
; SSE2-NEXT: pslld $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: pslld $16, %xmm3
@@ -1676,7 +1657,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
-; SSE2-NEXT: movdqu %xmm14, (%rax)
+; SSE2-NEXT: movdqu %xmm9, (%rax)
; SSE2-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_2:
@@ -1686,13 +1667,13 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
@@ -1723,9 +1704,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index e6cc95fcdb23..6869d088e7cd 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -34,8 +34,8 @@ define void @zero256() nounwind ssp {
define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind {
; CHECK-LABEL: ones:
; CHECK: ## BB#0: ## %allocas
-; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -51,8 +51,8 @@ float>* %ptr2vec615, align 32
define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwind {
; CHECK-LABEL: ones2:
; CHECK: ## BB#0: ## %allocas
-; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx-cvt-3.ll b/test/CodeGen/X86/avx-cvt-3.ll
index 066719b3bfe8..231334ddcb85 100644
--- a/test/CodeGen/X86/avx-cvt-3.ll
+++ b/test/CodeGen/X86/avx-cvt-3.ll
@@ -48,16 +48,16 @@ define <8 x float> @sitofp_shuffle_zero_v8i32(<8 x i32> %a0) {
define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) {
; X86-LABEL: sitofp_insert_allbits_v8i32:
; X86: # BB#0:
-; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: sitofp_insert_allbits_v8i32:
; X64: # BB#0:
-; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
; X64-NEXT: retq
@@ -72,16 +72,16 @@ define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) {
define <8 x float> @sitofp_shuffle_allbits_v8i32(<8 x i32> %a0) {
; X86-LABEL: sitofp_shuffle_allbits_v8i32:
; X86: # BB#0:
-; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: sitofp_shuffle_allbits_v8i32:
; X64: # BB#0:
-; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
; X64-NEXT: retq
@@ -95,8 +95,7 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) {
; X86: # BB#0:
; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: movl $2, %eax
@@ -111,8 +110,7 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) {
; X64: # BB#0:
; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: movl $2, %eax
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index 1d925ff8e9bd..3cadbe2a8db3 100644
--- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -99,16 +99,16 @@ define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind
define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_andnot_pd:
; X32: # BB#0:
-; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; X32-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; X32-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2
; X32-NEXT: vxorps %ymm2, %ymm0, %ymm0
; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_andnot_pd:
; X64: # BB#0:
-; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X64-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; X64-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; X64-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2
; X64-NEXT: vxorps %ymm2, %ymm0, %ymm0
; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
@@ -2244,11 +2244,11 @@ define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, doubl
; X32: # BB#0:
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; X32-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_pd:
@@ -2269,19 +2269,19 @@ define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3
; X32: # BB#0:
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
-; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
-; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
-; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_ps:
@@ -2881,10 +2881,10 @@ define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, doub
; X32: # BB#0:
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
@@ -2908,16 +2908,16 @@ define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a
; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
-; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; X32-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index 052cacfea4dc..bb05481e313d 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -2837,4 +2837,54 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
ret <8 x float> %8
}
 
+define void @test_zeroall() {
+; SANDY-LABEL: test_zeroall:
+; SANDY: # BB#0:
+; SANDY-NEXT: vzeroall # sched: [?:0.000000e+00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_zeroall:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vzeroall # sched: [1:0.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_zeroall:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vzeroall # sched: [?:0.000000e+00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_zeroall:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vzeroall # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ call void @llvm.x86.avx.vzeroall()
+ ret void
+}
+declare void @llvm.x86.avx.vzeroall() nounwind
+
+define void @test_zeroupper() {
+; SANDY-LABEL: test_zeroupper:
+; SANDY: # BB#0:
+; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; SANDY-NEXT: retq # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_zeroupper:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
+; HASWELL-NEXT: retq # sched: [1:1.00]
+;
+; BTVER2-LABEL: test_zeroupper:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_zeroupper:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [4:1.00]
+ call void @llvm.x86.avx.vzeroupper()
+ ret void
+}
+declare void @llvm.x86.avx.vzeroupper() nounwind
+
!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
index 341dd867e4ff..647b7a8f4dfc 100644
--- a/test/CodeGen/X86/avx.ll
+++ b/test/CodeGen/X86/avx.ll
@@ -113,11 +113,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; CHECK-NOT: mov
; CHECK: insertps $48
; CHECK: insertps $48
+; CHECK: vaddps
; CHECK: insertps $48
; CHECK: insertps $48
; CHECK: vaddps
; CHECK: vaddps
-; CHECK: vaddps
; CHECK-NEXT: ret
%1 = getelementptr inbounds float, float* %fb, i64 %index
%2 = load float, float* %1, align 4
diff --git a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
index 63b0281a7339..e29cf09718ad 100644
--- a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -13,10 +13,10 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float
; CHECK: # BB#0: # %entry
; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0
; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1
-; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2
-; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3
; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: korw %k3, %k2, %k1
+; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1
+; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2
+; CHECK-NEXT: korw %k2, %k1, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index 4890afec2164..c03623a2f035 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -852,16 +852,16 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %b
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT: movw $1, %ax
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm4
-; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT: movw $220, %ax
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
-; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
-; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 0e7a8d25c56f..56962ca2671d 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float>
; CHECK-NEXT: vbroadcastss %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
@@ -30,8 +30,8 @@ define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
@@ -51,8 +51,8 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32>
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
@@ -71,8 +71,8 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
@@ -91,8 +91,8 @@ define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16
; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
@@ -111,8 +111,8 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16
; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
@@ -131,8 +131,8 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x
; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
@@ -671,9 +671,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -1616,9 +1616,9 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x
; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
@@ -2031,8 +2031,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
@@ -2051,8 +2051,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1,
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
@@ -2651,8 +2651,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15]
; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>, <16 x float> %x2, i16 %x3)
@@ -2989,9 +2989,9 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
@@ -3010,9 +3010,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
@@ -3030,9 +3030,9 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
@@ -3050,9 +3050,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index cc5e9e038e0b..f800d01064ba 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -274,11 +274,11 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
%res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
@@ -301,11 +301,11 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
; CHECK-NEXT: vmovapd %xmm2, %xmm3
; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
%res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
@@ -477,11 +477,11 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx
-; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtsd2usi %xmm0, %rax
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4)
@@ -496,11 +496,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %rcx
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtsd2si %xmm0, %rax
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4)
@@ -515,11 +515,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %rcx
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtss2usi %xmm0, %rax
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4)
@@ -534,11 +534,11 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %rcx
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax
-; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: vcvtss2si %xmm0, %rax
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
-; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4)
@@ -553,11 +553,11 @@ declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx
-; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2usi %xmm0, %eax
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
@@ -572,11 +572,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %ecx
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2si %xmm0, %eax
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
@@ -591,11 +591,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %ecx
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2usi %xmm0, %eax
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
@@ -610,11 +610,11 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si32:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %ecx
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2si %xmm0, %eax
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
@@ -683,8 +683,9 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
-; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
@@ -3656,11 +3657,11 @@ define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float>
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
%res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
@@ -3684,10 +3685,10 @@ define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x dou
; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2
; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
%res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
@@ -3903,11 +3904,11 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d
; CHECK-NEXT: vmovapd %xmm2, %xmm3
; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
-; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4
; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4)
@@ -3928,11 +3929,11 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x flo
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4
+; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3
; CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4)
@@ -4434,8 +4435,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprold $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vprold $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
@@ -4454,8 +4455,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
@@ -4556,9 +4557,9 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
@@ -4579,9 +4580,9 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0,
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vmovapd %zmm0, %zmm5
; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4603,9 +4604,9 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x fl
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vmovaps %xmm0, %xmm5
; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm3
; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4650,9 +4651,9 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vmovaps %zmm0, %zmm5
; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
+; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
@@ -4721,9 +4722,9 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x
; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vmovapd %xmm0, %xmm5
; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm3
; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
@@ -4821,12 +4822,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT: vmovapd %xmm0, %xmm4
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4
-; CHECK-NEXT: vmovapd %xmm0, %xmm5
-; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm1
-; CHECK-NEXT: vaddpd %xmm5, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -4849,12 +4850,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %xmm0, %xmm4
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4
-; CHECK-NEXT: vmovaps %xmm0, %xmm5
-; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm1
-; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -4909,12 +4910,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d
; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm4
; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -4937,12 +4938,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo
; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm4
; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -5069,12 +5070,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d
; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm4
; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -5097,12 +5098,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo
; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm4
; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -5125,12 +5126,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x
; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm4
; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -5153,12 +5154,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm4
; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll
index 4ef88ac495c3..96aefdb10584 100644
--- a/test/CodeGen/X86/avx512-mask-spills.ll
+++ b/test/CodeGen/X86/avx512-mask-spills.ll
@@ -9,13 +9,11 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -34,14 +32,12 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
; CHECK-NEXT: Lcfi1:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -60,14 +56,12 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
; CHECK-NEXT: Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -85,14 +79,12 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
; CHECK-NEXT: Lcfi3:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
+; CHECK-NEXT: kord %k1, %k0, %k0
; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
-; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
-; CHECK-NEXT: kord %k1, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %ymm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -106,20 +98,18 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
; CHECK-LABEL: test_64i1:
; CHECK: ## BB#0:
-; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: pushq %rax
; CHECK-NEXT: Lcfi4:
-; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
+; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
-; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload
; CHECK-NEXT: vpmovm2b %k0, %zmm0
-; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
%cmp_res = icmp ugt <64 x i8> %a, %b
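All of the avx512-mask-spills.ll hunks above exercise the same simplification: the two compare results feeding a single OR are now combined in mask registers before the call, so one spill slot suffices instead of two. A minimal standalone sketch of the IR pattern being tested, assuming a hypothetical external callee @g (the tests call @f):

define <4 x i1> @or_before_spill(<4 x i32> %a, <4 x i32> %b) {
  %c1 = icmp ugt <4 x i32> %a, %b    ; lowered as vpcmpnleud -> %k0
  %c2 = icmp sgt <4 x i32> %a, %b    ; lowered as vpcmpgtd -> %k1
  %r = or <4 x i1> %c1, %c2          ; korw is now folded here, before the spill
  call void @g()                     ; the call clobbers the mask registers,
                                     ; forcing %r to be spilled and reloaded
  ret <4 x i1> %r
}
declare void @g()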
diff --git a/test/CodeGen/X86/avx512-scalar_mask.ll b/test/CodeGen/X86/avx512-scalar_mask.ll
new file mode 100644
index 000000000000..47c6813fa8dc
--- /dev/null
+++ b/test/CodeGen/X86/avx512-scalar_mask.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_var_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
+; CHECK-LABEL: test_var_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
+; CHECK-LABEL: test_var_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should just return %xmm0 here.
+define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const0_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should zero the lower element of xmm0 and return it.
+define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const0_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should just return %xmm0 here.
+define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const2_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should zero the lower element of xmm0 and return it.
+define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const2_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_allone_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_allone_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_allone_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_allone_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_3_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_3_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_3_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_3_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
+ ret < 4 x float> %res
+}
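The FIXMEs above reduce to one fact: the scalar ss/sd mask intrinsics consult only bit 0 of the i8 mask, so a constant mask with bit 0 clear (0 or 2) acts like 0, and one with bit 0 set (3 or -1) acts like -1, which the test_const_3 and test_const_allone cases already fold. A sketch of the folded forms the FIXMEs ask for, with hypothetical function names:

; Merge-masking with mask bit 0 clear: the destination lane (from %v0) is
; left untouched, so the whole call folds to its first operand.
define <4 x float> @folded_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
  ret <4 x float> %v0
}

; Zero-masking with mask bit 0 clear: lane 0 is zeroed and the upper lanes
; pass through from %v0.
define <4 x float> @folded_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
  %r = insertelement <4 x float> %v0, float 0.0, i32 0
  ret <4 x float> %r
}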
diff --git a/test/CodeGen/X86/avx512-vselect.ll b/test/CodeGen/X86/avx512-vselect.ll
new file mode 100644
index 000000000000..1940864824ff
--- /dev/null
+++ b/test/CodeGen/X86/avx512-vselect.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=skx | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
+; RUN: llc < %s -mcpu=knl | FileCheck %s --check-prefixes=CHECK,CHECK-KNL
+
+target triple = "x86_64-unknown-unknown"
+
+define <8 x i64> @test1(<8 x i64> %m, <8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0
+; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+entry:
+ %m.trunc = trunc <8 x i64> %m to <8 x i1>
+ %ret = select <8 x i1> %m.trunc, <8 x i64> %a, <8 x i64> %b
+ ret <8 x i64> %ret
+}
+
+; This is a very contrived test case to trick the legalizer into splitting the
+; v16i1 masks in the select during type legalization, and in so doing extend them
+; into two v8i64 types. This lets us ensure that the lowering code can handle
+; both formulations of vselect. All of this trickery is because we can't
+; directly form an SDAG input to the lowering.
+define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) {
+; CHECK-SKX-LABEL: test2:
+; CHECK-SKX: # BB#0: # %entry
+; CHECK-SKX-NEXT: vxorps %zmm6, %zmm6, %zmm6
+; CHECK-SKX-NEXT: vcmpltps %zmm0, %zmm6, %k0
+; CHECK-SKX-NEXT: vcmpltps %zmm6, %zmm1, %k1
+; CHECK-SKX-NEXT: korw %k1, %k0, %k0
+; CHECK-SKX-NEXT: kshiftrw $8, %k0, %k1
+; CHECK-SKX-NEXT: vpmovm2q %k1, %zmm1
+; CHECK-SKX-NEXT: vpmovm2q %k0, %zmm0
+; CHECK-SKX-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-SKX-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-SKX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; CHECK-SKX-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-KNL-LABEL: test2:
+; CHECK-KNL: # BB#0: # %entry
+; CHECK-KNL-NEXT: vpxord %zmm6, %zmm6, %zmm6
+; CHECK-KNL-NEXT: vcmpltps %zmm0, %zmm6, %k0
+; CHECK-KNL-NEXT: vcmpltps %zmm6, %zmm1, %k1
+; CHECK-KNL-NEXT: korw %k1, %k0, %k1
+; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k2
+; CHECK-KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; CHECK-KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-KNL-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; CHECK-KNL-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-KNL-NEXT: retq
+entry:
+ %gt.m = fcmp ogt <16 x float> %x, zeroinitializer
+ %lt.m = fcmp olt <16 x float> %y, zeroinitializer
+ %m.or = or <16 x i1> %gt.m, %lt.m
+ %ret = select <16 x i1> %m.or, <16 x double> %a, <16 x double> %b
+ ret <16 x double> %ret
+}
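As the comment in test2 says, splitting the v16i1 mask during type legalization means the vselect lowering must accept two mask shapes. A standalone sketch of both, with hypothetical names (test1 above is already the widened form):

; Direct form: select on i1 lanes, lowered straight to a k-register blend.
define <8 x i64> @vsel_direct(<8 x i1> %m, <8 x i64> %a, <8 x i64> %b) {
  %r = select <8 x i1> %m, <8 x i64> %a, <8 x i64> %b
  ret <8 x i64> %r
}

; Widened form: the mask arrives as v8i64 (as after the v16i1 split in test2)
; and is truncated back to i1 lanes before the select.
define <8 x i64> @vsel_widened(<8 x i64> %m, <8 x i64> %a, <8 x i64> %b) {
  %m.trunc = trunc <8 x i64> %m to <8 x i1>
  %r = select <8 x i1> %m.trunc, <8 x i64> %a, <8 x i64> %b
  ret <8 x i64> %r
}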
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 9b4e73a18fc2..faa055dfbbf3 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -796,9 +796,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512:
@@ -806,9 +806,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -826,8 +826,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1,
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
@@ -836,8 +836,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1,
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 3337f42eb142..13b850ccc3b6 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2159,9 +2159,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
@@ -2169,9 +2169,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4)
@@ -2411,9 +2411,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
@@ -2421,9 +2421,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index 7df07b0413ed..571f345d4616 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
@@ -29,8 +29,8 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
@@ -49,8 +49,8 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16>
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
@@ -69,8 +69,8 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
@@ -89,8 +89,8 @@ define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
@@ -109,8 +109,8 @@ define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16>
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
@@ -1476,9 +1476,9 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -1496,9 +1496,9 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16>
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -1596,8 +1596,8 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
@@ -1616,8 +1616,8 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1,
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
index 8f528394f5bd..f8f47c87100a 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32>
; CHECK-NEXT: vplzcntd %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 37aea45e6107..96254f7c95b0 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -7,8 +7,8 @@ define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32
; CHECK-NEXT: vplzcntd %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index c5478dad4224..1377733739fe 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -40,8 +40,8 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index 000390404b54..97ac0fde10ec 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -414,8 +414,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0,
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3)
@@ -434,8 +434,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index 52a84deebf51..595b3e0ebb86 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -1568,8 +1568,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0,
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
@@ -1588,9 +1588,9 @@ define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
@@ -1608,9 +1608,9 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index ad9ea93c2031..1bfdfd0e634d 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -635,8 +635,8 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0,
; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
@@ -680,8 +680,8 @@ define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
index ca130bd2b676..b8531e25bfa1 100644
--- a/test/CodeGen/X86/avx512er-intrinsics.ll
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -118,78 +118,78 @@ define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
}
declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) {
+define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_ss_maskz:
; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ;
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 %mask, i32 8) ;
ret <4 x float> %res
}
-define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) {
+define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_ss_mask:
; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ;
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask, i32 8) ;
ret <4 x float> %res
}
-define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) {
+define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_maskz:
; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 %mask, i32 8) ;
ret <2 x double> %res
}
-define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0) {
+define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_mask:
; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 7, i32 8) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask, i32 8) ;
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr ) {
+define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_maskz_mem:
; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %esi # encoding: [0x83,0xe6,0x01]
+; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
; CHECK-NEXT: retq # encoding: [0xc3]
%mem = load double , double * %ptr, align 8
%mem_v = insertelement <2 x double> undef, double %mem, i32 0
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
ret <2 x double> %res
}
-define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr ) {
+define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_maskz_mem_offset:
; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %esi # encoding: [0x83,0xe6,0x01]
+; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
; CHECK-NEXT: retq # encoding: [0xc3]
%ptr1 = getelementptr double, double* %ptr, i32 18
%mem = load double , double * %ptr1, align 8
%mem_v = insertelement <2 x double> undef, double %mem, i32 0
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
ret <2 x double> %res
}
diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 30ecc0d2e49e..9659dc6d455a 100644
--- a/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -13,8 +13,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -41,8 +41,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -69,8 +69,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -97,8 +97,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
index 3ca686cef3bf..b2fe6eba88ab 100644
--- a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
@@ -14,8 +14,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -42,8 +42,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -98,8 +98,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -126,8 +126,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -154,8 +154,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -182,8 +182,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -210,8 +210,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index 4d906a4fd29a..c2d8df6476b3 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -30,8 +30,8 @@ define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8]
-; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
@@ -50,8 +50,8 @@ define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8]
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9]
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1)
@@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8]
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1)
@@ -90,8 +90,8 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8]
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
@@ -110,8 +110,8 @@ define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %
; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8]
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9]
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
@@ -130,8 +130,8 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %
; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8]
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9]
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
@@ -152,9 +152,9 @@ define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
@@ -175,9 +175,9 @@ define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
@@ -198,9 +198,9 @@ define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
@@ -221,9 +221,9 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
@@ -243,9 +243,9 @@ define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0]
-; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
@@ -266,9 +266,9 @@ define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2]
-; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
@@ -3209,10 +3209,10 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01]
; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3]
; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0]
-; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb]
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
+; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1)
@@ -3540,9 +3540,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3]
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -3560,9 +3560,9 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -3580,9 +3580,9 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -3600,9 +3600,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -3720,8 +3720,8 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03]
-; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -3740,8 +3740,8 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03]
-; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -3760,8 +3760,8 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03]
-; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -3780,8 +3780,8 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03]
-; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4642,10 +4642,10 @@ define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32>
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02]
; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1)
@@ -4817,9 +4817,9 @@ define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb]
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
+; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
@@ -4837,9 +4837,9 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 1f324d679564..684b0468cf51 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -4368,8 +4368,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4388,8 +4388,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4408,8 +4408,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4428,8 +4428,8 @@ define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4528,8 +4528,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4548,8 +4548,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4568,8 +4568,8 @@ define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4588,8 +4588,8 @@ define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4690,9 +4690,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04]
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc]
; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc]
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4)
@@ -4732,9 +4732,9 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
+; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc]
; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4)
@@ -4755,9 +4755,9 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0,
; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8]
; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
+; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd]
; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4)
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index afeba4ef2d99..94e2ee7a0aa9 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -454,6 +454,30 @@ entry:
ret i32 %and
}
+define i32 @bzhi32d(i32 %a, i32 %b) {
+; CHECK-LABEL: bzhi32d:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %sub = sub i32 32, %b
+ %shr = lshr i32 -1, %sub
+ %and = and i32 %shr, %a
+ ret i32 %and
+}
+
+define i32 @bzhi32e(i32 %a, i32 %b) {
+; CHECK-LABEL: bzhi32e:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %sub = sub i32 32, %b
+ %shl = shl i32 %a, %sub
+ %shr = lshr i32 %shl, %sub
+ ret i32 %shr
+}
+
define i64 @bzhi64b(i64 %x, i8 zeroext %index) {
; CHECK-LABEL: bzhi64b:
; CHECK: # BB#0: # %entry
@@ -468,6 +492,58 @@ entry:
ret i64 %and
}
+define i64 @bzhi64c(i64 %a, i64 %b) {
+; CHECK-LABEL: bzhi64c:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+ %sub = sub i64 64, %b
+ %shr = lshr i64 -1, %sub
+ %and = and i64 %shr, %a
+ ret i64 %and
+}
+
+define i64 @bzhi64d(i64 %a, i32 %b) {
+; CHECK-LABEL: bzhi64d:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+ %sub = sub i32 64, %b
+ %sh_prom = zext i32 %sub to i64
+ %shr = lshr i64 -1, %sh_prom
+ %and = and i64 %shr, %a
+ ret i64 %and
+}
+
+define i64 @bzhi64e(i64 %a, i64 %b) {
+; CHECK-LABEL: bzhi64e:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+ %sub = sub i64 64, %b
+ %shl = shl i64 %a, %sub
+ %shr = lshr i64 %shl, %sub
+ ret i64 %shr
+}
+
+define i64 @bzhi64f(i64 %a, i32 %b) {
+; CHECK-LABEL: bzhi64f:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+ %sub = sub i32 64, %b
+ %sh_prom = zext i32 %sub to i64
+ %shl = shl i64 %a, %sh_prom
+ %shr = lshr i64 %shl, %sh_prom
+ ret i64 %shr
+}
+
define i64 @bzhi64_constant_mask(i64 %x) {
; CHECK-LABEL: bzhi64_constant_mask:
; CHECK: # BB#0: # %entry
diff --git a/test/CodeGen/X86/bswap_tree2.ll b/test/CodeGen/X86/bswap_tree2.ll
index a9c74df9d0d9..1340b7662a7a 100644
--- a/test/CodeGen/X86/bswap_tree2.ll
+++ b/test/CodeGen/X86/bswap_tree2.ll
@@ -9,31 +9,32 @@
define i32 @test1(i32 %x) nounwind {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000
-; CHECK-NEXT: shll $8, %edx
-; CHECK-NEXT: shrl $8, %eax
-; CHECK-NEXT: bswapl %ecx
-; CHECK-NEXT: shrl $16, %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000
+; CHECK-NEXT: shll $8, %ecx
+; CHECK-NEXT: shrl $8, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: bswapl %eax
+; CHECK-NEXT: shrl $16, %eax
; CHECK-NEXT: orl %edx, %eax
-; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: test1:
; CHECK64: # BB#0:
-; CHECK64-NEXT: movl %edi, %ecx
-; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000
; CHECK64-NEXT: movl %edi, %eax
-; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000
-; CHECK64-NEXT: shll $8, %ecx
-; CHECK64-NEXT: shrl $8, %eax
+; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000
+; CHECK64-NEXT: movl %edi, %ecx
+; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
+; CHECK64-NEXT: shll $8, %eax
+; CHECK64-NEXT: shrl $8, %ecx
+; CHECK64-NEXT: orl %eax, %ecx
; CHECK64-NEXT: bswapl %edi
; CHECK64-NEXT: shrl $16, %edi
-; CHECK64-NEXT: orl %ecx, %eax
-; CHECK64-NEXT: orl %edi, %eax
+; CHECK64-NEXT: orl %ecx, %edi
+; CHECK64-NEXT: movl %edi, %eax
; CHECK64-NEXT: retq
%byte0 = and i32 %x, 255 ; 0x000000ff
%byte1 = and i32 %x, 65280 ; 0x0000ff00
diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll
index 1e44aec99fc5..83ab2fac2f16 100644
--- a/test/CodeGen/X86/cast-vsel.ll
+++ b/test/CodeGen/X86/cast-vsel.ll
@@ -200,32 +200,29 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d)
; SSE41: # BB#0:
; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm1, %xmm5
-; SSE41-NEXT: pshufb %xmm1, %xmm4
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE41-NEXT: pshufb %xmm1, %xmm3
; SSE41-NEXT: pshufb %xmm1, %xmm2
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT: pand %xmm0, %xmm2
-; SSE41-NEXT: pandn %xmm4, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pshufb %xmm1, %xmm5
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc:
; AVX1: # BB#0:
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
-; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -233,13 +230,11 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d)
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cmp = icmp eq <8 x i16> %a, %b
diff --git a/test/CodeGen/X86/combine-abs.ll b/test/CodeGen/X86/combine-abs.ll
index 887abe99f6ed..37beb438d737 100644
--- a/test/CodeGen/X86/combine-abs.ll
+++ b/test/CodeGen/X86/combine-abs.ll
@@ -50,12 +50,11 @@ define <32 x i8> @combine_v32i8_abs_abs(<32 x i8> %a) {
define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) {
; AVX2-LABEL: combine_v4i64_abs_abs:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll
index 3ad38f2717d9..3dbff2680c22 100644
--- a/test/CodeGen/X86/combine-shl.ll
+++ b/test/CodeGen/X86/combine-shl.ll
@@ -11,8 +11,7 @@ define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
;
; AVX-LABEL: combine_vec_shl_zero:
; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsllvd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shl <4 x i32> zeroinitializer, %x
ret <4 x i32> %1
diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll
index 706e89051a3d..21564cdd7353 100644
--- a/test/CodeGen/X86/combine-srl.ll
+++ b/test/CodeGen/X86/combine-srl.ll
@@ -6,30 +6,12 @@
define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_zero:
; SSE: # BB#0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: psrld %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: psrld %xmm2, %xmm4
-; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: psrld %xmm0, %xmm3
-; SSE-NEXT: psrld %xmm2, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_zero:
; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsrlvd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = lshr <4 x i32> zeroinitializer, %x
ret <4 x i32> %1
diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll
index 7b82125dc372..2f3c343afac0 100644
--- a/test/CodeGen/X86/constructor.ll
+++ b/test/CodeGen/X86/constructor.ll
@@ -3,6 +3,8 @@
; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=INIT-ARRAY %s
; RUN: llc -mtriple x86_64-unknown-freebsd < %s | FileCheck --check-prefix=INIT-ARRAY %s
; RUN: llc -mtriple x86_64-unknown-nacl < %s | FileCheck --check-prefix=NACL %s
+; RUN: llc -mtriple i586-intel-elfiamcu -use-ctors < %s | FileCheck %s --check-prefix=MCU-CTORS
+; RUN: llc -mtriple i586-intel-elfiamcu < %s | FileCheck %s --check-prefix=MCU-INIT-ARRAY
@llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}, { i32, void ()*, i8* } { i32 15, void ()* @g, i8* @v }]
@v = weak_odr global i8 0
@@ -37,3 +39,6 @@ entry:
; NACL-NEXT: .section .init_array,"aw",@init_array
; NACL-NEXT: .p2align 2
; NACL-NEXT: .long f
+
+; MCU-CTORS: .section .ctors,"aw",@progbits
+; MCU-INIT-ARRAY: .section .init_array,"aw",@init_array
diff --git a/test/CodeGen/X86/dbg-baseptr.ll b/test/CodeGen/X86/dbg-baseptr.ll
index fb0da1b50d11..893ca93a9944 100644
--- a/test/CodeGen/X86/dbg-baseptr.ll
+++ b/test/CodeGen/X86/dbg-baseptr.ll
@@ -1,4 +1,5 @@
; RUN: llc -o - %s | FileCheck %s
+; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s --check-prefix=DWARF
; This test checks that parameters on the stack pointer are correctly
; referenced by debug info.
target triple = "x86_64--"
@@ -7,24 +8,54 @@ target triple = "x86_64--"
@ptr = external global i32*
%struct.s = type { i32, i32, i32, i32, i32 }
+; Simple case: no FP, use offset from RSP.
+
; CHECK-LABEL: f0:
-; CHECK: DEBUG_VALUE: f:input <- [%RSP+8]
+; CHECK-NOT: pushq
+; CHECK: movl $42, %eax
+; CHECK: retq
define i32 @f0(%struct.s* byval align 8 %input) !dbg !8 {
call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !4, metadata !17), !dbg !18
- ret i32 42
+ ret i32 42, !dbg !18
}
+; DWARF-LABEL: .debug_info contents:
+
+; DWARF-LABEL: DW_TAG_subprogram
+; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 57 )
+; 0x57 -> RSP
+; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f0")
+; DWARF: DW_TAG_formal_parameter
+; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 08 )
+; DW_OP_fbreg (0x91) 0x08
+; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input")
+
+
+; Dynamic alloca forces the use of RBP as the base pointer
+
; CHECK-LABEL: f1:
-; CHECK: DEBUG_VALUE: f:input <- [%RBP+16]
+; CHECK: pushq %rbp
+; CHECK: movl $42, %eax
+; CHECK: popq %rbp
+; CHECK: retq
define i32 @f1(%struct.s* byval align 8 %input) !dbg !19 {
%val = load i64, i64* @glob
; this alloca should force FP usage.
%stackspace = alloca i32, i64 %val, align 1
store i32* %stackspace, i32** @ptr
call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !20, metadata !17), !dbg !21
- ret i32 42
+ ret i32 42, !dbg !21
}
+; DWARF-LABEL: DW_TAG_subprogram
+; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 56 )
+; 0x56 -> RBP
+; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f1")
+; DWARF: DW_TAG_formal_parameter
+; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 10 )
+; DW_OP_fbreg (0x91) 0x10
+; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input")
+
; CHECK-LABEL: f2:
; Just check that we are indeed aligning the stack and setting up a base pointer
; in RBX.
@@ -34,17 +65,24 @@ define i32 @f1(%struct.s* byval align 8 %input) !dbg !19 {
; CHECK: andq $-64, %rsp
; CHECK: subq $64, %rsp
; CHECK: movq %rsp, %rbx
-; The parameter should still be referenced through RBP though.
-; CHECK-NOT: DEBUG_VALUE: f:input <- [%RBX
-; CHECK: DEBUG_VALUE: f:input <- [%RBP+16]
define i32 @f2(%struct.s* byval align 8 %input) !dbg !22 {
%val = load i64, i64* @glob
%stackspace = alloca i32, i64 %val, align 64
store i32* %stackspace, i32** @ptr
call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !23, metadata !17), !dbg !24
- ret i32 42
+ ret i32 42, !dbg !24
}
+; "input" should still be referred to through RBP.
+; DWARF-LABEL: DW_TAG_subprogram
+; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 56 )
+; 0x56 -> RBP
+; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f2")
+; DWARF: DW_TAG_formal_parameter
+; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 10 )
+; DW_OP_fbreg (0x91) 0x10
+; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input")
+
declare void @llvm.dbg.declare(metadata, metadata, metadata)
!llvm.dbg.cu = !{!2}
@@ -52,7 +90,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, emissionKind: FullDebug)
!3 = !DIFile(filename: "dbg-baseptr.ll", directory: "/")
!4 = !DILocalVariable(name: "input", arg: 1, scope: !8, file: !3, line: 5, type: !9)
!5 = !{}
@@ -60,7 +98,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
!6 = !DISubroutineType(types: !7)
!7 = !{!10, !9}
-!8 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5)
+!8 = distinct !DISubprogram(name: "f0", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, unit: !2, variables: !5)
!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s", elements: !11)
!10 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
@@ -74,9 +112,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
!17 = !DIExpression()
!18 = !DILocation(line: 5, scope: !8)
-!19 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5)
+!19 = distinct !DISubprogram(name: "f1", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5)
!20 = !DILocalVariable(name: "input", arg: 1, scope: !19, file: !3, line: 5, type: !9)
!21 = !DILocation(line: 5, scope: !19)
-!22 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5)
+!22 = distinct !DISubprogram(name: "f2", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5)
!23 = !DILocalVariable(name: "input", arg: 1, scope: !22, file: !3, line: 5, type: !9)
!24 = !DILocation(line: 5, scope: !22)
diff --git a/test/CodeGen/X86/elf-associated.ll b/test/CodeGen/X86/elf-associated.ll
index 361cf66cce72..7d58c3437025 100644
--- a/test/CodeGen/X86/elf-associated.ll
+++ b/test/CodeGen/X86/elf-associated.ll
@@ -37,3 +37,8 @@
@l = global i32 1, section "ccc", !associated !5
!5 = !{i32* null}
; CHECK-DAG: .section ccc,"aw",@progbits
+
+; Null metadata.
+@m = global i32 1, section "ddd", !associated !6
+!6 = distinct !{null}
+; CHECK-DAG: .section ddd,"aw",@progbits
diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll
index d68236e9d250..eb06eb75a4d7 100644
--- a/test/CodeGen/X86/fold-tied-op.ll
+++ b/test/CodeGen/X86/fold-tied-op.ll
@@ -6,9 +6,10 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386--netbsd"
; CHECK-LABEL: fn1
-; CHECK: addl {{.*#+}} 4-byte Folded Reload
-; CHECK: imull {{.*#+}} 4-byte Folded Reload
-; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: addl {{.*#+}} 4-byte Folded Reload
+; CHECK: xorl {{.*#+}} 4-byte Folded Reload
+; CHECK: xorl {{.*#+}} 4-byte Folded Reload
; CHECK: retl
%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll
index 98082ec611d4..6c6bc8bdc1d1 100644
--- a/test/CodeGen/X86/fp128-i128.ll
+++ b/test/CodeGen/X86/fp128-i128.ll
@@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
; CHECK-NEXT: andq %rdi, %rcx
; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: jmp foo # TAILCALL
diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll
index 4596b83f7bc2..b5507523a75a 100644
--- a/test/CodeGen/X86/haddsub-2.ll
+++ b/test/CodeGen/X86/haddsub-2.ll
@@ -933,14 +933,14 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
-; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll
index cea9ac26edbc..ec620b8ce877 100644
--- a/test/CodeGen/X86/known-signbits-vector.ll
+++ b/test/CodeGen/X86/known-signbits-vector.ll
@@ -137,3 +137,64 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin
%6 = sitofp i64 %5 to float
ret float %6
}
+
+define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: signbits_sext_shuffle_sitofp:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxdq %xmm0, %xmm1
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-NEXT: vpmovsxdq %xmm0, %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-NEXT: vcvtdq2pd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: signbits_sext_shuffle_sitofp:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxdq %xmm0, %xmm1
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: vpmovsxdq %xmm0, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-NEXT: vcvtdq2pd %xmm0, %ymm0
+; X64-NEXT: retq
+ %1 = sext <4 x i32> %a0 to <4 x i64>
+ %2 = shufflevector <4 x i64> %1, <4 x i64>%a1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %3 = sitofp <4 x i64> %2 to <4 x double>
+ ret <4 x double> %3
+}
+
+define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
+; X32: # BB#0:
+; X32-NEXT: vpsrad $16, %xmm0, %xmm1
+; X32-NEXT: vpsrlq $16, %xmm0, %xmm0
+; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; X32-NEXT: vpsrlq $16, %xmm0, %xmm0
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-NEXT: vcvtdq2pd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
+; X64: # BB#0:
+; X64-NEXT: vpsrad $16, %xmm0, %xmm1
+; X64-NEXT: vpsrlq $16, %xmm0, %xmm0
+; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; X64-NEXT: vpsrlq $16, %xmm0, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: vcvtdq2pd %xmm0, %xmm0
+; X64-NEXT: retq
+ %1 = ashr <2 x i64> %a0, <i64 16, i64 16>
+ %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %3 = shufflevector <4 x i64> %a1, <4 x i64> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %4 = ashr <4 x i64> %3, <i64 16, i64 16, i64 16, i64 16>
+ %5 = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %6 = sitofp <2 x i64> %5 to <2 x double>
+ ret <2 x double> %6
+}
diff --git a/test/CodeGen/X86/leaFixup32.mir b/test/CodeGen/X86/leaFixup32.mir
new file mode 100644
index 000000000000..70aac21c7ff2
--- /dev/null
+++ b/test/CodeGen/X86/leaFixup32.mir
@@ -0,0 +1,508 @@
+# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s
+--- |
+ ; ModuleID = 'test/CodeGen/X86/fixup-lea.ll'
+ source_filename = "test/CodeGen/X86/fixup-lea.ll"
+ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+ target triple = "i386"
+  ;generated using: llc -stop-after x86-pad-short-functions fixup-lea.ll > leaFixup32.mir
+
+  ;test2add_32: 3-operand LEA32r that can be replaced with 2 add instructions,
+ ; where ADD32ri8 is chosen
+ define i32 @test2add_32() {
+ ret i32 0
+ }
+
+  ;test2add_ebp_32: 3-operand LEA32r that can be replaced with 2 add instructions,
+  ; where the base is the rbp/r13/ebp register
+ define i32 @test2add_ebp_32() {
+ ret i32 0
+ }
+
+  ;test1add_ebp_32: 2-operand LEA32r where the base register is ebp and can be replaced
+ ; with an add instruction
+ define i32 @test1add_ebp_32() {
+ ret i32 0
+ }
+
+  ;testleaadd_32: 3-operand LEA32r that can be replaced with 1 lea and 1 add instruction
+ define i32 @testleaadd_32() {
+ ret i32 0
+ }
+
+  ;testleaadd_ebp_32: 3-operand LEA32r that can be replaced with 1 lea and 1 add instruction,
+  ; where the base is the ebp register
+ define i32 @testleaadd_ebp_32() {
+ ret i32 0
+ }
+
+  ;test1lea_ebp_32: 2-operand LEA32r where the base register is rbp/r13/ebp and can be replaced
+ ; with a lea instruction
+ define i32 @test1lea_ebp_32() {
+ ret i32 0
+ }
+
+  ;test2addi32_32: 3-operand LEA32r that can be replaced with 2 add instructions, where ADD32ri32
+ ; is chosen
+ define i32 @test2addi32_32() {
+ ret i32 0
+ }
+
+  ;test1mov1add_ebp_32: 2-operand LEA32r that can be replaced with 1 add and 1 mov instruction,
+  ; where the base is the rbp/r13/ebp register
+ define i32 @test1mov1add_ebp_32() {
+ ret i32 0
+ }
+
+  ;testleaadd_ebp_index_32: 3-operand LEA32r that can be replaced with 1 lea and 1 add instruction,
+  ; where the base and the index are the ebp register and there is an offset
+ define i32 @testleaadd_ebp_index_32() {
+ ret i32 0
+ }
+
+  ;testleaadd_ebp_index2_32: 3-operand LEA32r that can be replaced with 1 lea and 1 add instruction,
+  ; where the base and the index are the ebp register and there is a scale
+ define i32 @testleaadd_ebp_index2_32() {
+ ret i32 0
+ }
+
+  ;test_skip_opt_32: 3-operand LEA32r that cannot be replaced with 2 instructions
+ define i32 @test_skip_opt_32() {
+ ret i32 0
+ }
+
+  ;test_skip_eflags_32: LEA32r that cannot be replaced since it's not safe to clobber eflags
+ define i32 @test_skip_eflags_32() {
+ ret i32 0
+ }
+
+...
+---
+name: test2add_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %eax = ADD32rr %eax, killed %ebp
+ ; CHECK: %eax = ADD32ri8 %eax, -5
+
+ %eax = LEA32r killed %eax, 1, killed %ebp, -5, _
+ RETQ %eax
+
+...
+---
+name: test2add_ebp_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %ebp = ADD32rr %ebp, killed %eax
+ ; CHECK: %ebp = ADD32ri8 %ebp, -5
+
+ %ebp = LEA32r killed %ebp, 1, killed %eax, -5, _
+ RETQ %ebp
+
+...
+---
+name: test1add_ebp_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %ebp = ADD32rr %ebp, killed %eax
+
+ %ebp = LEA32r killed %ebp, 1, killed %eax, 0, _
+ RETQ %ebp
+
+...
+---
+name: testleaadd_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+ - { reg: '%ebx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %esi
+ ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0
+ ; CHECK: %ebx = ADD32ri8 %ebx, -5
+
+ %ebx = LEA32r killed %eax, 1, killed %ebp, -5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_ebp_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+ - { reg: '%ebx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _
+ ; CHECK: %ebx = ADD32ri8 %ebx, -5
+
+ %ebx = LEA32r killed %ebp, 1, killed %eax, -5, _
+ RETQ %ebx
+
+...
+---
+name: test1lea_ebp_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+ - { reg: '%ebx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _
+
+ %ebx = LEA32r killed %ebp, 1, killed %eax, 0, _
+ RETQ %ebx
+
+...
+---
+name: test2addi32_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp
+ ; CHECK: %eax = ADD32rr %eax, killed %ebp
+ ; CHECK: %eax = ADD32ri %eax, 129
+
+ %eax = LEA32r killed %eax, 1, killed %ebp, 129, _
+ RETQ %eax
+
+...
+---
+name: test1mov1add_ebp_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%eax' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebx = MOV32rr killed %ebp
+ ; CHECK: %ebx = ADD32rr %ebx, killed %ebp
+
+ %ebx = LEA32r killed %ebp, 1, killed %ebp, 0, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_ebp_index_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%ebx' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebx = LEA32r _, 1, killed %ebp, 5, _
+ ; CHECK: %ebx = ADD32rr %ebx, killed %ebp
+
+ %ebx = LEA32r killed %ebp, 1, killed %ebp, 5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_ebp_index2_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%ebx' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebx = LEA32r _, 4, killed %ebp, 5, _
+ ; CHECK: %ebx = ADD32rr %ebx, killed %ebp
+
+ %ebx = LEA32r killed %ebp, 4, killed %ebp, 5, _
+ RETQ %ebx
+
+...
+---
+name: test_skip_opt_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%ebx' }
+ - { reg: '%ebp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _
+
+ %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _
+ RETQ %ebp
+
+...
+---
+name: test_skip_eflags_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%ebp' }
+ - { reg: '%eax' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebx = LEA32r killed %eax, 4, killed %eax, 5, _
+ ; CHECK: %ebp = LEA32r killed %ebx, 4, killed %ebx, 0, _
+ ; CHECK: %ebp = ADD32ri8 %ebp, 5
+
+ CMP32rr %eax, killed %ebx, implicit-def %eflags
+ %ebx = LEA32r killed %eax, 4, killed %eax, 5, _
+ JE_1 %bb.1, implicit %eflags
+ RETQ %ebx
+ bb.1:
+ liveins: %eax, %ebp, %ebx
+ %ebp = LEA32r killed %ebx, 4, killed %ebx, 5, _
+ RETQ %ebp
+
+...
+
+
+
diff --git a/test/CodeGen/X86/leaFixup64.mir b/test/CodeGen/X86/leaFixup64.mir
new file mode 100644
index 000000000000..9b0058750598
--- /dev/null
+++ b/test/CodeGen/X86/leaFixup64.mir
@@ -0,0 +1,1041 @@
+# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s
+--- |
+ ; ModuleID = 'lea-2.ll'
+ source_filename = "lea-2.ll"
+ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+ ;generated using: llc -stop-after x86-pad-short-functions lea-2.ll > leaFixup64.mir
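+
+ ; For reference when reading the MIR bodies below: LEA memory operands follow
+ ; the MIR order (base, scale, index, displacement, segment), so a line such as
+ ; "%rbx = LEA64r killed %rax, 1, killed %rbp, -5, _" corresponds to the AT&T
+ ; instruction "leaq -5(%rax,%rbp), %rbx".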
+
+ ;testleaadd_64_32_1: a 3-operand LEA64_32r that cannot be replaced with 2 add instructions
+ ; but can be replaced with 1 lea + 1 add
+ define i32 @testleaadd_64_32_1() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_64_32_1: a 3-operand LEA64_32r whose base is the rbp/r13/ebp register; it cannot
+ ; be replaced with 2 add instructions but can be replaced with 1 lea + 1 add
+ define i32 @testleaadd_rbp_64_32_1() {
+ ret i32 0
+ }
+
+ ;test1lea_rbp_64_32_1: a 2-operand LEA64_32r whose base register is rbp/r13/ebp; it cannot
+ ; be replaced with an add instruction but can be replaced with 1 lea instruction
+ define i32 @test1lea_rbp_64_32_1() {
+ ret i32 0
+ }
+
+ ;test2add_64: a 3-operand LEA64r that can be replaced with 2 add instructions
+ define i32 @test2add_64() {
+ ret i32 0
+ }
+
+ ;test2add_rbp_64: a 3-operand LEA64r that can be replaced with 2 add instructions
+ ; where the base is the rbp/r13/ebp register
+ define i32 @test2add_rbp_64() {
+ ret i32 0
+ }
+
+ ;test1add_rbp_64: a 2-operand LEA64r whose base register is rbp/r13/ebp; it can be replaced
+ ; with an add instruction
+ define i32 @test1add_rbp_64() {
+ ret i32 0
+ }
+
+ ;testleaadd_64_32: a 3-operand LEA64_32r that can be replaced with 1 lea + 1 add
+ define i32 @testleaadd_64_32() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_64_32: a 3-operand LEA64_32r that can be replaced with 1 lea + 1 add
+ ; where the base is the rbp/r13/ebp register
+ define i32 @testleaadd_rbp_64_32() {
+ ret i32 0
+ }
+
+ ;test1lea_rbp_64_32: a 2-operand LEA64_32r whose base register is rbp/r13/ebp; it can be replaced
+ ; with a lea instruction
+ define i32 @test1lea_rbp_64_32() {
+ ret i32 0
+ }
+
+ ;testleaadd_64: a 3-operand LEA64r that can be replaced with 1 lea + 1 add
+ define i32 @testleaadd_64() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_64: a 3-operand LEA64r that can be replaced with 1 lea + 1 add
+ ; where the base is the rbp/r13/ebp register
+ define i32 @testleaadd_rbp_64() {
+ ret i32 0
+ }
+
+ ;test1lea_rbp_64: a 2-operand LEA64r whose base register is rbp/r13/ebp; it can be replaced
+ ; with a lea instruction
+ define i32 @test1lea_rbp_64() {
+ ret i32 0
+ }
+
+ ;test8: a 3-operand LEA64r with scale != 1; the base is peeled off into a separate add
+ define i32 @test8() {
+ ret i32 0
+ }
+
+ ;testleaaddi32_64_32: a 3-operand LEA64_32r that can be replaced with 1 lea + 1 add, where
+ ; ADD32ri is chosen because the displacement does not fit in 8 bits
+ define i32 @testleaaddi32_64_32() {
+ ret i32 0
+ }
+
+ ;test1mov1add_rbp_64_32: a 2-operand LEA64_32r that cannot be replaced with 1 mov + 1 add
+ ; where the base is the rbp/r13/ebp register
+ define i32 @test1mov1add_rbp_64_32() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_index_64_32: a 3-operand LEA64_32r that cannot be replaced with 1 lea + 1 add
+ ; where the base and the index are both the ebp register and there is an offset
+ define i32 @testleaadd_rbp_index_64_32() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_index2_64_32: a 3-operand LEA64_32r that cannot be replaced with 1 lea + 1 add
+ ; where the base and the index are both the ebp register and there is a scale
+ define i32 @testleaadd_rbp_index2_64_32() {
+ ret i32 0
+ }
+
+ ;test2addi32_64: a 3-operand LEA64r that can be replaced with 2 add instructions, where ADD64ri32
+ ; is chosen because the displacement does not fit in 8 bits
+ define i32 @test2addi32_64() {
+ ret i32 0
+ }
+
+ ;test1mov1add_rbp_64: a 2-operand LEA64r that can be replaced with 1 mov + 1 add
+ ; where the base is the rbp/r13/ebp register
+ define i32 @test1mov1add_rbp_64() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_index_64: a 3-operand LEA64r that can be replaced with 1 lea + 1 add
+ ; where the base and the index are both the ebp register and there is an offset
+ define i32 @testleaadd_rbp_index_64() {
+ ret i32 0
+ }
+
+ ;testleaadd_rbp_index2_64: a 3-operand LEA64r that can be replaced with 1 lea + 1 add
+ ; where the base and the index are both the ebp register and there is a scale
+ define i32 @testleaadd_rbp_index2_64() {
+ ret i32 0
+ }
+
+ ;test_skip_opt_64: a 3-operand LEA64r that cannot be replaced with 2 instructions
+ define i32 @test_skip_opt_64() {
+ ret i32 0
+ }
+
+ ;test_skip_eflags_64: an LEA64r that cannot be replaced because it is not safe to clobber
+ ; EFLAGS: an inserted add would set EFLAGS between the CMP and the JE that reads them
+ define i32 @test_skip_eflags_64() {
+ ret i32 0
+ }
+
+ ;test_skip_opt_64_32: a 3-operand LEA64_32r that cannot be replaced with 2 instructions
+ define i32 @test_skip_opt_64_32() {
+ ret i32 0
+ }
+
+ ;test_skip_eflags_64_32: an LEA64_32r that cannot be replaced because it is not safe to clobber
+ ; EFLAGS: an inserted add would set EFLAGS between the CMP and the JE that reads them
+ define i32 @test_skip_eflags_64_32() {
+ ret i32 0
+ }
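+
+ ; Taken together, the tests below exercise one rewrite; as a sketch distilled
+ ; from the CHECK lines (not itself a test):
+ ;   %rbx = LEA64r killed %rax, 1, killed %rbp, -5, _
+ ; becomes
+ ;   %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
+ ;   %rbx = ADD64ri8 %rbx, -5
+ ; and, when the destination equals the base register, two plain adds:
+ ;   %rax = ADD64rr %rax, killed %rbp
+ ;   %rax = ADD64ri8 %rax, -5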
+
+
+...
+---
+name: testleaadd_64_32_1
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0
+ ; CHECK: %eax = ADD32ri8 %eax, -5
+
+ %eax = LEA64_32r killed %rax, 1, killed %rbp, -5, _
+ RETQ %eax
+
+...
+---
+name: testleaadd_rbp_64_32_1
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0
+ ; CHECK: %ebp = ADD32ri8 %ebp, -5
+
+ %ebp = LEA64_32r killed %rbp, 1, killed %rax, -5, _
+ RETQ %ebp
+
+...
+---
+name: test1lea_rbp_64_32_1
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0
+
+ %ebp = LEA64_32r killed %rbp, 1, killed %rax, 0, _
+ RETQ %ebp
+
+...
+---
+name: test2add_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rax = ADD64rr %rax, killed %rbp
+ ; CHECK: %rax = ADD64ri8 %rax, -5
+
+ %rax = LEA64r killed %rax, 1, killed %rbp, -5, _
+ RETQ %eax
+
+...
+---
+name: test2add_rbp_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rbp = ADD64rr %rbp, killed %rax
+ ; CHECK: %rbp = ADD64ri8 %rbp, -5
+
+ %rbp = LEA64r killed %rbp, 1, killed %rax, -5, _
+ RETQ %ebp
+
+...
+---
+name: test1add_rbp_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rbp = ADD64rr %rbp, killed %rax
+
+ %rbp = LEA64r killed %rbp, 1, killed %rax, 0, _
+ RETQ %ebp
+
+...
+---
+name: testleaadd_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %ebx = ADD32ri8 %ebx, -5
+
+ %ebx = LEA64_32r killed %rax, 1, killed %rbp, -5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %ebx = ADD32ri8 %ebx, -5
+
+ %ebx = LEA64_32r killed %rbp, 1, killed %rax, -5, _
+ RETQ %ebx
+
+...
+---
+name: test1lea_rbp_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
+
+ %ebx = LEA64_32r killed %rbp, 1, killed %rax, 0, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %rbx = ADD64ri8 %rbx, -5
+
+ %rbx = LEA64r killed %rax, 1, killed %rbp, -5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %rbx = ADD64ri8 %rbx, -5
+
+ %rbx = LEA64r killed %rbp, 1, killed %rax, -5, _
+ RETQ %ebx
+
+...
+---
+name: test1lea_rbp_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+ - { reg: '%rbx' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
+
+ %rbx = LEA64r killed %rbp, 1, killed %rax, 0, _
+ RETQ %ebx
+
+...
+---
+name: test8
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rdi, %rbp
+ ; CHECK: %r12 = LEA64r _, 2, killed %r13, 5, _
+ ; CHECK: %r12 = ADD64rr %r12, killed %rbp
+ %rbp = KILL %rbp, implicit-def %rbp
+ %r13 = KILL %rdi, implicit-def %r13
+ %r12 = LEA64r killed %rbp, 2, killed %r13, 5, _
+ RETQ %r12
+
+...
+---
+name: testleaaddi32_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0
+ ; CHECK: %eax = ADD32ri %eax, 129
+
+ %eax = LEA64_32r killed %rax, 1, killed %rbp, 129, _
+ RETQ %eax
+
+...
+---
+name: test1mov1add_rbp_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _
+
+ %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_index_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _
+
+ %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_index2_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %eax, %ebp, %ebx
+ ; CHECK: %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _
+
+ %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _
+ RETQ %ebx
+
+...
+---
+name: test2addi32_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp
+ ; CHECK: %rax = ADD64rr %rax, killed %rbp
+ ; CHECK: %rax = ADD64ri32 %rax, 129
+
+ %rax = LEA64r killed %rax, 1, killed %rbp, 129, _
+ RETQ %eax
+
+...
+---
+name: test1mov1add_rbp_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rax' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %rbx = MOV64rr killed %rbp
+ ; CHECK: %rbx = ADD64rr %rbx, killed %rbp
+
+ %rbx = LEA64r killed %rbp, 1, killed %rbp, 0, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_index_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %rbx = LEA64r _, 1, killed %rbp, 5, _
+ ; CHECK: %rbx = ADD64rr %rbx, killed %rbp
+
+ %rbx = LEA64r killed %rbp, 1, killed %rbp, 5, _
+ RETQ %ebx
+
+...
+---
+name: testleaadd_rbp_index2_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %rbx = LEA64r _, 4, killed %rbp, 5, _
+ ; CHECK: %rbx = ADD64rr %rbx, killed %rbp
+
+ %rbx = LEA64r killed %rbp, 4, killed %rbp, 5, _
+ RETQ %ebx
+
+...
+---
+name: test_skip_opt_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _
+
+ %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _
+ RETQ %ebp
+
+...
+---
+name: test_skip_eflags_64
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbp' }
+ - { reg: '%rax' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %rbx = LEA64r killed %rax, 4, killed %rax, 5, _
+ ; CHECK: %rbp = LEA64r killed %rbx, 4, killed %rbx, 0, _
+ ; CHECK: %rbp = ADD64ri8 %rbp, 5
+
+ CMP64rr %rax, killed %rbx, implicit-def %eflags
+ %rbx = LEA64r killed %rax, 4, killed %rax, 5, _
+ JE_1 %bb.1, implicit %eflags
+ RETQ %ebx
+ bb.1:
+ liveins: %rax, %rbp, %rbx
+ %rbp = LEA64r killed %rbx, 4, killed %rbx, 5, _
+ RETQ %ebp
+
+...
+---
+name: test_skip_opt_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbx' }
+ - { reg: '%rbp' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _
+
+ %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _
+ RETQ %ebp
+
+...
+---
+name: test_skip_eflags_64_32
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rbp' }
+ - { reg: '%rax' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %rax, %rbp, %rbx
+ ; CHECK: %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _
+ ; CHECK: %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 0, _
+ ; CHECK: %ebp = ADD32ri8 %ebp, 5
+
+ CMP64rr %rax, killed %rbx, implicit-def %eflags
+ %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _
+ JE_1 %bb.1, implicit %eflags
+ RETQ %ebx
+ bb.1:
+ liveins: %rax, %rbp, %rbx
+ %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 5, _
+ RETQ %ebp
+
+...
+
+
+
diff --git a/test/CodeGen/X86/lrshrink.ll b/test/CodeGen/X86/lrshrink.ll
new file mode 100644
index 000000000000..a9cf086dbd90
--- /dev/null
+++ b/test/CodeGen/X86/lrshrink.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Checks that the adds (e.g. "%9 = add nuw nsw i64 %4, %8") are scheduled
+; before the last call rather than after it, to minimize live ranges.
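+; (Scheduling an add earlier ends the live ranges of its operands before the
+; next call, so fewer values have to be kept live across each call site.)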
+
+define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) {
+entry:
+ br i1 %a, label %then, label %else
+
+then:
+ br label %else
+
+else:
+ %0 = phi i64 [ 4, %entry ], [ 10, %then ]
+ %r = phi i64 [ %r1, %entry ], [ %r2, %then ]
+ %s = phi i64 [ %s1, %entry ], [ %s2, %then ]
+ %t = phi i64 [ %t1, %entry ], [ %t2, %then ]
+; CHECK-LABEL: test:
+; CHECK: add
+; CHECK: add
+; CHECK: call
+; CHECK: add
+; CHECK: call
+; CHECK: add
+; CHECK: call
+; CHECK: add
+ %1 = tail call i32 @_Z3foov()
+ %2 = zext i32 %1 to i64
+ %3 = tail call i32 @_Z3foov()
+ %4 = zext i32 %3 to i64
+ %5 = tail call i32 @_Z3foov()
+ %6 = zext i32 %5 to i64
+ %7 = add nuw nsw i64 %0, %r
+ tail call void @llvm.dbg.value(metadata i64 %7, i64 0, metadata !5, metadata !DIExpression()), !dbg !6
+ %8 = add nuw nsw i64 %2, %7
+ %9 = add nuw nsw i64 %4, %8
+ %10 = add nuw nsw i64 %6, %9
+ %11 = add nuw nsw i64 %s, %t
+ tail call void @llvm.dbg.value(metadata i64 %11, i64 0, metadata !5, metadata !DIExpression()), !dbg !6
+ %12 = add nuw nsw i64 %10, %11
+ ret i64 %12
+}
+
+declare i32 @_Z3foov()
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!1, !2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, emissionKind: FullDebug)
+!1 = !{i32 2, !"Dwarf Version", i32 4}
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !DIFile(filename: "a.c", directory: "./")
+!4 = distinct !DISubprogram(name: "test", scope: !3, unit: !0)
+!5 = !DILocalVariable(name: "x", scope: !4)
+!6 = !DILocation(line: 4, scope: !4)
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index d332b2f3169f..af86df510016 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -129,9 +129,9 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: addq $16, %rsi
; SSE2-NEXT: addq $16, %rdi
; SSE2-NEXT: addq $-8, %rax
@@ -246,23 +246,23 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm6
-; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm7
-; SSE2-NEXT: pmullw %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; SSE2-NEXT: psrad $16, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm7
-; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: paddd %xmm5, %xmm1
; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: pmullw %xmm4, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: addq $16, %rsi
; SSE2-NEXT: addq $16, %rdi
; SSE2-NEXT: addq $-16, %rax
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 29a662fb217e..c5de8dd96cbc 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -3,7 +3,7 @@
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
-; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
+; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 71417694b0d4..2f7714e63886 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -270,9 +270,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s
; SSE2: # BB#0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_4f32_f32_012u:
@@ -292,9 +292,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_012u:
@@ -321,9 +321,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s
; SSE2: # BB#0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_4f32_f32_019u:
@@ -343,9 +343,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_019u:
diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll
index e62a1d04dad6..94bbe75702cb 100644
--- a/test/CodeGen/X86/misched-matrix.ll
+++ b/test/CodeGen/X86/misched-matrix.ll
@@ -17,9 +17,9 @@
;
; TOPDOWN-LABEL: %for.body
; TOPDOWN: movl %{{.*}}, (
-; TOPDOWN: imull {{[0-9]*}}(
+; TOPDOWN-NOT: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 4(
-; TOPDOWN: imull {{[0-9]*}}(
+; TOPDOWN-NOT: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 8(
; TOPDOWN: movl %{{.*}}, 12(
; TOPDOWN-LABEL: %for.end
diff --git a/test/CodeGen/X86/not-and-simplify.ll b/test/CodeGen/X86/not-and-simplify.ll
index dfce6c681500..83b2be83d552 100644
--- a/test/CodeGen/X86/not-and-simplify.ll
+++ b/test/CodeGen/X86/not-and-simplify.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefix=ALL --check-prefix=NO_BMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=ALL --check-prefix=BMI
@@ -11,13 +12,24 @@ define i32 @shrink_xor_constant1(i32 %x) {
; ALL-NEXT: xorl $1, %edi
; ALL-NEXT: movl %edi, %eax
; ALL-NEXT: retq
-;
%sh = lshr i32 %x, 31
%not = xor i32 %sh, -1
%and = and i32 %not, 1
ret i32 %and
}
+define <4 x i32> @shrink_xor_constant1_splat(<4 x i32> %x) {
+; ALL-LABEL: shrink_xor_constant1_splat:
+; ALL: # BB#0:
+; ALL-NEXT: psrld $31, %xmm0
+; ALL-NEXT: pandn {{.*}}(%rip), %xmm0
+; ALL-NEXT: retq
+ %sh = lshr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
+ %not = xor <4 x i32> %sh, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %and = and <4 x i32> %not, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %and
+}
+
; Clear low bits via shift, set them with xor (not), then mask them off.
define i8 @shrink_xor_constant2(i8 %x) {
@@ -27,10 +39,22 @@ define i8 @shrink_xor_constant2(i8 %x) {
; ALL-NEXT: xorb $-32, %dil
; ALL-NEXT: movl %edi, %eax
; ALL-NEXT: retq
-;
%sh = shl i8 %x, 5
%not = xor i8 %sh, -1
%and = and i8 %not, 224 ; 0xE0
ret i8 %and
}
+define <16 x i8> @shrink_xor_constant2_splat(<16 x i8> %x) {
+; ALL-LABEL: shrink_xor_constant2_splat:
+; ALL: # BB#0:
+; ALL-NEXT: psllw $5, %xmm0
+; ALL-NEXT: pand {{.*}}(%rip), %xmm0
+; ALL-NEXT: pandn {{.*}}(%rip), %xmm0
+; ALL-NEXT: retq
+ %sh = shl <16 x i8> %x, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
+ %not = xor <16 x i8> %sh, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %and = and <16 x i8> %not, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <16 x i8> %and
+}
+
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index d26cf02dd942..0bda41a30c69 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -746,9 +746,9 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; SSE2-LABEL: interleave_24i8_in:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -791,17 +791,17 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; SSE42: # BB#0:
; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5]
-; SSE42-NEXT: movdqa %xmm2, %xmm3
+; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE42-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5]
+; SSE42-NEXT: movdqa %xmm1, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero
-; SSE42-NEXT: por %xmm1, %xmm3
+; SSE42-NEXT: por %xmm2, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: por %xmm0, %xmm2
-; SSE42-NEXT: movq %xmm2, 16(%rdi)
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: por %xmm0, %xmm1
+; SSE42-NEXT: movq %xmm1, 16(%rdi)
; SSE42-NEXT: movdqu %xmm3, (%rdi)
; SSE42-NEXT: retq
;
@@ -809,16 +809,16 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; AVX: # BB#0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero
-; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, 16(%rdi)
-; AVX-NEXT: vmovdqu %xmm1, (%rdi)
+; AVX-NEXT: vmovdqu %xmm2, (%rdi)
; AVX-NEXT: retq
%s1 = load <8 x i8>, <8 x i8>* %q1, align 4
%s2 = load <8 x i8>, <8 x i8>* %q2, align 4
diff --git a/test/CodeGen/X86/packss.ll b/test/CodeGen/X86/packss.ll
index 5cd649bb3902..24db6ba9ca2f 100644
--- a/test/CodeGen/X86/packss.ll
+++ b/test/CodeGen/X86/packss.ll
@@ -26,18 +26,17 @@ define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind {
; X64-AVX1-LABEL: trunc_ashr_v4i64:
; X64-AVX1: # BB#0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
; X64-AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: trunc_ashr_v4i64:
; X64-AVX2: # BB#0:
-; X64-AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
-; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; X64-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 88cb7a6d5825..50a661fcca11 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1152,9 +1152,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1166,9 +1166,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pmuludq %xmm2, %xmm4
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pmuludq %xmm3, %xmm0
-; SSE41-NEXT: pmuludq %xmm2, %xmm4
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
; SSE41-NEXT: retq
;
@@ -1312,17 +1312,17 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pmuludq %xmm7, %xmm5
+; SSE2-NEXT: pmuludq %xmm7, %xmm4
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE2-NEXT: pmuludq %xmm0, %xmm2
-; SSE2-NEXT: pmuludq %xmm8, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSE2-NEXT: pmuludq %xmm0, %xmm5
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3]
; SSE2-NEXT: movaps %xmm4, %xmm0
; SSE2-NEXT: movaps %xmm5, %xmm1
@@ -1331,22 +1331,22 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE41-LABEL: mul_v8i64_zero_upper:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pmuludq %xmm4, %xmm1
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT: pmuludq %xmm5, %xmm0
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT: pmuludq %xmm6, %xmm2
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT: pmuludq %xmm7, %xmm1
-; SSE41-NEXT: pmuludq %xmm6, %xmm2
-; SSE41-NEXT: pmuludq %xmm5, %xmm0
-; SSE41-NEXT: pmuludq %xmm8, %xmm4
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; SSE41-NEXT: retq
;
@@ -1356,11 +1356,11 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1
-; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
@@ -1467,22 +1467,22 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE41-LABEL: mul_v8i64_sext:
; SSE41: # BB#0:
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm3, %xmm8
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm3, %xmm7
-; SSE41-NEXT: pmovsxwq %xmm0, %xmm5
+; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
+; SSE41-NEXT: pmovsxwq %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm3
+; SSE41-NEXT: pmuldq %xmm4, %xmm3
; SSE41-NEXT: pmovsxdq %xmm2, %xmm2
+; SSE41-NEXT: pmuldq %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm4
+; SSE41-NEXT: pmuldq %xmm6, %xmm4
; SSE41-NEXT: pmovsxdq %xmm1, %xmm0
-; SSE41-NEXT: pmuldq %xmm5, %xmm0
-; SSE41-NEXT: pmuldq %xmm7, %xmm4
-; SSE41-NEXT: pmuldq %xmm6, %xmm2
-; SSE41-NEXT: pmuldq %xmm8, %xmm3
+; SSE41-NEXT: pmuldq %xmm7, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: retq
;
@@ -1493,9 +1493,10 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vmovdqa %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v8i64_sext:
diff --git a/test/CodeGen/X86/pr28129.ll b/test/CodeGen/X86/pr28129.ll
index a155f71f79c3..15bffffa207f 100644
--- a/test/CodeGen/X86/pr28129.ll
+++ b/test/CodeGen/X86/pr28129.ll
@@ -5,15 +5,15 @@
define <4 x double> @cmp4f64_domain(<4 x double> %a) {
; X86-LABEL: cmp4f64_domain:
; X86: # BB#0:
-; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: cmp4f64_domain:
; X64: # BB#0:
-; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%cmp = fcmp oeq <4 x double> zeroinitializer, zeroinitializer
@@ -26,15 +26,15 @@ define <4 x double> @cmp4f64_domain(<4 x double> %a) {
define <4 x double> @cmp4f64_domain_optsize(<4 x double> %a) optsize {
; X86-LABEL: cmp4f64_domain_optsize:
; X86: # BB#0:
-; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: cmp4f64_domain_optsize:
; X64: # BB#0:
-; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%cmp = fcmp oeq <4 x double> zeroinitializer, zeroinitializer
@@ -47,15 +47,15 @@ define <4 x double> @cmp4f64_domain_optsize(<4 x double> %a) optsize {
define <8 x float> @cmp8f32_domain(<8 x float> %a) {
; X86-LABEL: cmp8f32_domain:
; X86: # BB#0:
-; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: cmp8f32_domain:
; X64: # BB#0:
-; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%cmp = fcmp oeq <8 x float> zeroinitializer, zeroinitializer
@@ -68,15 +68,15 @@ define <8 x float> @cmp8f32_domain(<8 x float> %a) {
define <8 x float> @cmp8f32_domain_optsize(<8 x float> %a) optsize {
; X86-LABEL: cmp8f32_domain_optsize:
; X86: # BB#0:
-; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: cmp8f32_domain_optsize:
; X64: # BB#0:
-; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%cmp = fcmp oeq <8 x float> zeroinitializer, zeroinitializer
diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll
index 8c970b3d4771..94904018872b 100644
--- a/test/CodeGen/X86/pr29112.ll
+++ b/test/CodeGen/X86/pr29112.ll
@@ -38,7 +38,8 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1]
+; CHECK-NEXT: vaddps %xmm14, %xmm1, %xmm10
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3]
@@ -52,10 +53,9 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vmovaps %xmm15, %xmm1
; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm9
-; CHECK-NEXT: vaddps %xmm14, %xmm10, %xmm0
; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm8
-; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm3
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovaps %xmm9, (%rsp)
diff --git a/test/CodeGen/X86/pr30562.ll b/test/CodeGen/X86/pr30562.ll
index dda736a1a183..a8e648074194 100644
--- a/test/CodeGen/X86/pr30562.ll
+++ b/test/CodeGen/X86/pr30562.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
define i32 @foo(i64* nocapture %perm, i32 %n) {
entry:
br label %body
diff --git a/test/CodeGen/X86/pr31088.ll b/test/CodeGen/X86/pr31088.ll
index 0dd8eb0ece85..d7a546c7396d 100644
--- a/test/CodeGen/X86/pr31088.ll
+++ b/test/CodeGen/X86/pr31088.ll
@@ -150,12 +150,12 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
; F16C-NEXT: vcvtph2ps %xmm3, %xmm3
; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1
; F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; F16C-NEXT: vcvtph2ps %xmm2, %xmm2
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1
; F16C-NEXT: retq
%retval = fadd <2 x half> %arg0, %arg1
ret <2 x half> %retval
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index e05fc926b080..143e3af82eb7 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -30,25 +30,24 @@ define void @foo() {
; X86-O0-NEXT: subl $12, %esp
; X86-O0-NEXT: .Lcfi0:
; X86-O0-NEXT: .cfi_def_cfa_offset 16
-; X86-O0-NEXT: movzbl c, %eax
-; X86-O0-NEXT: testl %eax, %eax
-; X86-O0-NEXT: setne %cl
-; X86-O0-NEXT: movl %eax, %edx
-; X86-O0-NEXT: movb %dl, %ch
-; X86-O0-NEXT: testb %ch, %ch
+; X86-O0-NEXT: movb c, %al
+; X86-O0-NEXT: testb %al, %al
; X86-O0-NEXT: setne {{[0-9]+}}(%esp)
-; X86-O0-NEXT: movzbl %cl, %edx
-; X86-O0-NEXT: subl %eax, %edx
-; X86-O0-NEXT: setle %cl
-; X86-O0-NEXT: # implicit-def: %EAX
-; X86-O0-NEXT: movb %cl, %al
-; X86-O0-NEXT: andl $1, %eax
-; X86-O0-NEXT: kmovd %eax, %k0
-; X86-O0-NEXT: kmovd %k0, %eax
+; X86-O0-NEXT: movzbl c, %ecx
+; X86-O0-NEXT: testl %ecx, %ecx
+; X86-O0-NEXT: setne %al
+; X86-O0-NEXT: movzbl %al, %edx
+; X86-O0-NEXT: subl %ecx, %edx
+; X86-O0-NEXT: setle %al
+; X86-O0-NEXT: # implicit-def: %ECX
; X86-O0-NEXT: movb %al, %cl
-; X86-O0-NEXT: andb $1, %cl
-; X86-O0-NEXT: movzbl %cl, %eax
-; X86-O0-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-O0-NEXT: andl $1, %ecx
+; X86-O0-NEXT: kmovd %ecx, %k0
+; X86-O0-NEXT: kmovd %k0, %ecx
+; X86-O0-NEXT: movb %cl, %al
+; X86-O0-NEXT: andb $1, %al
+; X86-O0-NEXT: movzbl %al, %ecx
+; X86-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-O0-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-O0-NEXT: addl $12, %esp
; X86-O0-NEXT: retl
@@ -69,27 +68,25 @@ define void @foo() {
;
; X64-O0-LABEL: foo:
; X64-O0: # BB#0: # %entry
-; X64-O0-NEXT: movzbl {{.*}}(%rip), %eax
-; X64-O0-NEXT: movl %eax, %ecx
-; X64-O0-NEXT: movb %cl, %dl
-; X64-O0-NEXT: movl %ecx, %eax
-; X64-O0-NEXT: testq %rcx, %rcx
-; X64-O0-NEXT: setne %sil
-; X64-O0-NEXT: testb %dl, %dl
+; X64-O0-NEXT: movb {{.*}}(%rip), %al
+; X64-O0-NEXT: testb %al, %al
; X64-O0-NEXT: setne -{{[0-9]+}}(%rsp)
-; X64-O0-NEXT: movzbl %sil, %edi
-; X64-O0-NEXT: subl %eax, %edi
-; X64-O0-NEXT: setle %dl
-; X64-O0-NEXT: # implicit-def: %EAX
-; X64-O0-NEXT: movb %dl, %al
-; X64-O0-NEXT: andl $1, %eax
-; X64-O0-NEXT: kmovd %eax, %k0
-; X64-O0-NEXT: kmovd %k0, %eax
-; X64-O0-NEXT: movb %al, %dl
-; X64-O0-NEXT: andb $1, %dl
-; X64-O0-NEXT: movzbl %dl, %eax
-; X64-O0-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-O0-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-O0-NEXT: movzbl {{.*}}(%rip), %ecx
+; X64-O0-NEXT: testl %ecx, %ecx
+; X64-O0-NEXT: setne %al
+; X64-O0-NEXT: movzbl %al, %edx
+; X64-O0-NEXT: subl %ecx, %edx
+; X64-O0-NEXT: setle %al
+; X64-O0-NEXT: # implicit-def: %ECX
+; X64-O0-NEXT: movb %al, %cl
+; X64-O0-NEXT: andl $1, %ecx
+; X64-O0-NEXT: kmovd %ecx, %k0
+; X64-O0-NEXT: kmovd %k0, %ecx
+; X64-O0-NEXT: movb %cl, %al
+; X64-O0-NEXT: andb $1, %al
+; X64-O0-NEXT: movzbl %al, %ecx
+; X64-O0-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-O0-NEXT: movl %edx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; X64-O0-NEXT: retq
entry:
%a = alloca i8, align 1
diff --git a/test/CodeGen/X86/pr32907.ll b/test/CodeGen/X86/pr32907.ll
index bc03fbe06843..8057b31c961c 100644
--- a/test/CodeGen/X86/pr32907.ll
+++ b/test/CodeGen/X86/pr32907.ll
@@ -5,41 +5,44 @@
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
define <2 x i64> @PR32907(<2 x i64> %astype.i, <2 x i64> %astype6.i) {
-; SSE-LABEL: PR32907:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: psubq %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: psubq %xmm0, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: PR32907:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: psubq %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: PR32907:
+; SSE42: # BB#0: # %entry
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: pcmpgtq %xmm0, %xmm1
+; SSE42-NEXT: pxor %xmm1, %xmm0
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: retq
;
; AVX2-LABEL: PR32907:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm2
-; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR32907:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpsubq %xmm0, %xmm2, %xmm2
-; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll b/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll
new file mode 100644
index 000000000000..9a5da33223ba
--- /dev/null
+++ b/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll
@@ -0,0 +1,37 @@
+; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s -o /dev/null
+; pr33001 - Check that llc doesn't crash when running with the -O0 option.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define <4 x i32> @test_masked_load(<4 x i32>* %base, <4 x i1> %mask) {
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %base, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+
+
+define void @test_masked_store(<4 x i32>* %base, <4 x i32> %value, <4 x i1> %mask) {
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %base, i32 4, <4 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+
+
+define <4 x i32> @llvm_masked_gather(<4 x i32*> %ptrs, <4 x i1> %mask) {
+ %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> undef)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+
+
+define void @llvm_masked_scatter(<4 x i32*> %ptrs, <4 x i32> %value, <4 x i1> %mask) {
+ call void @llvm.masked.scatter.v4i32(<4 x i32> %value, <4 x i32*> %ptrs, i32 4, <4 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
+
diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll
index 5d5150ad62d6..4be3a4c2391b 100644
--- a/test/CodeGen/X86/rotate.ll
+++ b/test/CodeGen/X86/rotate.ll
@@ -33,8 +33,8 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: movl %ebx, %esi
; 32-NEXT: xorl %ebx, %ebx
; 32-NEXT: .LBB0_4:
-; 32-NEXT: orl %esi, %eax
; 32-NEXT: orl %ebx, %edx
+; 32-NEXT: orl %esi, %eax
; 32-NEXT: popl %esi
; 32-NEXT: popl %edi
; 32-NEXT: popl %ebx
@@ -86,8 +86,8 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: movl %ebx, %esi
; 32-NEXT: xorl %ebx, %ebx
; 32-NEXT: .LBB1_4:
-; 32-NEXT: orl %ebx, %eax
; 32-NEXT: orl %esi, %edx
+; 32-NEXT: orl %ebx, %eax
; 32-NEXT: popl %esi
; 32-NEXT: popl %edi
; 32-NEXT: popl %ebx
@@ -546,7 +546,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
; 32-LABEL: rotr1_64_mem:
; 32: # BB#0:
; 32-NEXT: pushl %esi
-; 32-NEXT: movl 8(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: movl (%eax), %ecx
; 32-NEXT: movl 4(%eax), %edx
; 32-NEXT: movl %edx, %esi
@@ -555,11 +555,13 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
; 32-NEXT: movl %ecx, 4(%eax)
; 32-NEXT: movl %esi, (%eax)
; 32-NEXT: popl %esi
-
+; 32-NEXT: retl
+;
; 64-LABEL: rotr1_64_mem:
; 64: # BB#0:
; 64-NEXT: rorq (%rdi)
; 64-NEXT: retq
+
%A = load i64, i64 *%Aptr
%B = shl i64 %A, 63
%C = lshr i64 %A, 1
@@ -571,7 +573,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
define void @rotr1_32_mem(i32* %Aptr) nounwind {
; 32-LABEL: rotr1_32_mem:
; 32: # BB#0:
-; 32-NEXT: movl 4(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorl (%eax)
; 32-NEXT: retl
;
@@ -590,7 +592,7 @@ define void @rotr1_32_mem(i32* %Aptr) nounwind {
define void @rotr1_16_mem(i16* %Aptr) nounwind {
; 32-LABEL: rotr1_16_mem:
; 32: # BB#0:
-; 32-NEXT: movl 4(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorw (%eax)
; 32-NEXT: retl
;
@@ -609,7 +611,7 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind {
define void @rotr1_8_mem(i8* %Aptr) nounwind {
; 32-LABEL: rotr1_8_mem:
; 32: # BB#0:
-; 32-NEXT: movl 4(%esp), %eax
+; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorb (%eax)
; 32-NEXT: retl
;
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index b8a8b8afd14f..6a565a5c76f0 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -149,127 +149,131 @@ middle.block:
define i32 @sad_32i8() nounwind {
; SSE2-LABEL: sad_32i8:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm12, %xmm12
-; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm14, %xmm14
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm6
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm8
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm8
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm10
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm10, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm2, %xmm7
; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm9, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm9, %xmm6
-; SSE2-NEXT: movdqa b+1024(%rax), %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm10, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm10, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm2, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm4, %xmm10
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm10
-; SSE2-NEXT: pxor %xmm2, %xmm10
-; SSE2-NEXT: movdqa %xmm8, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm8
-; SSE2-NEXT: pxor %xmm2, %xmm8
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm7
-; SSE2-NEXT: pxor %xmm2, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm2, %xmm6
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm6, %xmm14
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm6, %xmm0
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm6, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm9, %xmm8
+; SSE2-NEXT: movdqa %xmm7, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: paddd %xmm6, %xmm7
+; SSE2-NEXT: pxor %xmm6, %xmm7
; SSE2-NEXT: paddd %xmm7, %xmm13
-; SSE2-NEXT: paddd %xmm1, %xmm15
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: paddd %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm10, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm12
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: paddd %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm14
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm15
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm8
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # BB#2: # %middle.block
-; SSE2-NEXT: paddd %xmm15, %xmm3
-; SSE2-NEXT: paddd %xmm14, %xmm1
-; SSE2-NEXT: paddd %xmm12, %xmm0
-; SSE2-NEXT: paddd %xmm13, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm15, %xmm6
+; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: paddd %xmm14, %xmm13
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2-NEXT: paddd %xmm13, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@@ -398,288 +402,284 @@ middle.block:
define i32 @sad_avx64i8() nounwind {
; SSE2-LABEL: sad_avx64i8:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: subq $184, %rsp
-; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: subq $200, %rsp
+; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm14, %xmm14
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pxor %xmm13, %xmm13
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm6
-; SSE2-NEXT: movdqa a+1024(%rax), %xmm4
-; SSE2-NEXT: movdqa a+1056(%rax), %xmm11
-; SSE2-NEXT: movdqa a+1072(%rax), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm12
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm14
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm14, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
-; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm13
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm9, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm6
-; SSE2-NEXT: movdqa b+1024(%rax), %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm10, %xmm8
-; SSE2-NEXT: movdqa %xmm13, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm13, %xmm14
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm9, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm2, %xmm4
-; SSE2-NEXT: movdqa b+1056(%rax), %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm10, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm10, %xmm12
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm2, %xmm11
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm10
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm2, %xmm13
-; SSE2-NEXT: movdqa b+1072(%rax), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm2, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15]
+; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movaps a+1040(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1024(%rax), %xmm12
+; SSE2-NEXT: movdqa a+1056(%rax), %xmm15
+; SSE2-NEXT: movdqa a+1072(%rax), %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
+; SSE2-NEXT: movdqa %xmm15, %xmm11
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm11, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm15, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm0, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm2, %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: pxor %xmm0, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; SSE2-NEXT: movdqa %xmm12, %xmm10
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
; SSE2-NEXT: movdqa %xmm10, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm10
-; SSE2-NEXT: pxor %xmm0, %xmm10
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm13, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm13
-; SSE2-NEXT: pxor %xmm0, %xmm13
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm11, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm11
-; SSE2-NEXT: pxor %xmm0, %xmm11
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm12
-; SSE2-NEXT: pxor %xmm0, %xmm12
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm13
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; SSE2-NEXT: movdqa b+1072(%rax), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm0, %xmm1
+; SSE2-NEXT: movdqa b+1056(%rax), %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm7, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm7, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm7, %xmm8
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm11
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm0, %xmm15
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm9
+; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm0, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm0, %xmm13
+; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm9, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm7, %xmm12
+; SSE2-NEXT: movdqa b+1040(%rax), %xmm13
+; SSE2-NEXT: movdqa %xmm13, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm7, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm3, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm13, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm13, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm8
+; SSE2-NEXT: pxor %xmm1, %xmm8
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm11, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm11
+; SSE2-NEXT: pxor %xmm1, %xmm11
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm11, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm11
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm15, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm15
+; SSE2-NEXT: pxor %xmm1, %xmm15
+; SSE2-NEXT: paddd %xmm15, %xmm2
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm15
+; SSE2-NEXT: movdqa %xmm10, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm10
+; SSE2-NEXT: pxor %xmm1, %xmm10
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm10, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: movdqa %xmm12, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm12
+; SSE2-NEXT: pxor %xmm1, %xmm12
+; SSE2-NEXT: paddd %xmm12, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: movdqa %xmm9, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm4
+; SSE2-NEXT: paddd %xmm0, %xmm9
+; SSE2-NEXT: pxor %xmm0, %xmm9
+; SSE2-NEXT: paddd %xmm9, %xmm1
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm7
; SSE2-NEXT: pxor %xmm0, %xmm7
-; SSE2-NEXT: movdqa %xmm14, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm14
-; SSE2-NEXT: pxor %xmm0, %xmm14
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: pxor %xmm0, %xmm8
-; SSE2-NEXT: movdqa %xmm6, %xmm0
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm6
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm8, %xmm6
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm14, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm12, %xmm8
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa %xmm0, %xmm12
-; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm7
+; SSE2-NEXT: pxor %xmm0, %xmm7
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: paddd %xmm13, %xmm7
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm10, %xmm1
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm5, %xmm3
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm7, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # BB#2: # %middle.block
-; SSE2-NEXT: paddd %xmm2, %xmm4
-; SSE2-NEXT: paddd %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm12, %xmm2
-; SSE2-NEXT: paddd %xmm11, %xmm2
-; SSE2-NEXT: paddd %xmm13, %xmm14
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm5, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm3, %xmm8
+; SSE2-NEXT: paddd %xmm2, %xmm15
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm8, %xmm13
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm5, %xmm0
+; SSE2-NEXT: paddd %xmm11, %xmm10
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm10, %xmm1
+; SSE2-NEXT: paddd %xmm13, %xmm1
+; SSE2-NEXT: paddd %xmm15, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: paddd %xmm3, %xmm7
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm14, %xmm6
-; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm8, %xmm7
-; SSE2-NEXT: paddd %xmm6, %xmm7
-; SSE2-NEXT: paddd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1]
-; SSE2-NEXT: paddd %xmm7, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: addq $184, %rsp
+; SSE2-NEXT: addq $200, %rsp
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_avx64i8:
@@ -688,8 +688,8 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6
; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5
; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7
@@ -697,7 +697,6 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -705,48 +704,49 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8
+; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
+; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8
+; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
+; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
+; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
+; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
-; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15
-; AVX2-NEXT: vpabsd %ymm8, %ymm8
+; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15
+; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vpabsd %ymm9, %ymm8
+; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpabsd %ymm10, %ymm8
+; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
+; AVX2-NEXT: vpabsd %ymm11, %ymm8
; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3
-; AVX2-NEXT: vpabsd %ymm14, %ymm8
-; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1
-; AVX2-NEXT: vpabsd %ymm13, %ymm8
-; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
; AVX2-NEXT: vpabsd %ymm12, %ymm8
; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0
-; AVX2-NEXT: vpabsd %ymm11, %ymm8
-; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
-; AVX2-NEXT: vpabsd %ymm10, %ymm8
-; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
-; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpabsd %ymm13, %ymm8
+; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
+; AVX2-NEXT: vpabsd %ymm14, %ymm8
+; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1
; AVX2-NEXT: vpabsd %ymm15, %ymm8
-; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB2_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4
+; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -773,21 +773,21 @@ define i32 @sad_avx64i8() nounwind {
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7
-; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6
-; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5
; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7
; AVX512F-NEXT: vpabsd %zmm4, %zmm4
-; AVX512F-NEXT: vpabsd %zmm5, %zmm5
-; AVX512F-NEXT: vpabsd %zmm6, %zmm6
-; AVX512F-NEXT: vpabsd %zmm7, %zmm7
-; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3
-; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2
-; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1
; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; AVX512F-NEXT: vpabsd %zmm5, %zmm4
+; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT: vpabsd %zmm6, %zmm4
+; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2
+; AVX512F-NEXT: vpabsd %zmm7, %zmm4
+; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB2_1
; AVX512F-NEXT: # BB#2: # %middle.block
@@ -1154,59 +1154,54 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; SSE2-LABEL: sad_nonloop_32i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu 16(%rdi), %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm12
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm12, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm13
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm13, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movdqu (%rdx), %xmm5
-; SSE2-NEXT: movdqu 16(%rdx), %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT: psubd %xmm5, %xmm0
-; SSE2-NEXT: psubd %xmm7, %xmm3
-; SSE2-NEXT: psubd %xmm2, %xmm13
-; SSE2-NEXT: psubd %xmm1, %xmm12
-; SSE2-NEXT: psubd %xmm8, %xmm6
-; SSE2-NEXT: psubd %xmm15, %xmm11
-; SSE2-NEXT: psubd %xmm14, %xmm10
-; SSE2-NEXT: psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm1, %xmm9
+; SSE2-NEXT: movdqu 16(%rdi), %xmm12
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm12, %xmm8
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
+; SSE2-NEXT: movdqa %xmm12, %xmm13
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: movdqu (%rdx), %xmm7
+; SSE2-NEXT: movdqu 16(%rdx), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: psubd %xmm5, %xmm10
+; SSE2-NEXT: movdqa %xmm7, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: psubd %xmm5, %xmm11
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: psubd %xmm5, %xmm13
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; SSE2-NEXT: psubd %xmm6, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psubd %xmm2, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT: psubd %xmm3, %xmm12
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
+; SSE2-NEXT: psubd %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm10, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm10
@@ -1215,33 +1210,37 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm11
; SSE2-NEXT: pxor %xmm1, %xmm11
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm1, %xmm6
-; SSE2-NEXT: movdqa %xmm12, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm12
-; SSE2-NEXT: pxor %xmm1, %xmm12
; SSE2-NEXT: movdqa %xmm13, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm13
; SSE2-NEXT: pxor %xmm1, %xmm13
-; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm13, %xmm4
+; SSE2-NEXT: paddd %xmm10, %xmm4
+; SSE2-NEXT: paddd %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm8
+; SSE2-NEXT: pxor %xmm1, %xmm8
+; SSE2-NEXT: movdqa %xmm9, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm9
+; SSE2-NEXT: pxor %xmm1, %xmm9
+; SSE2-NEXT: movdqa %xmm12, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm12
+; SSE2-NEXT: pxor %xmm1, %xmm12
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: paddd %xmm11, %xmm6
-; SSE2-NEXT: paddd %xmm9, %xmm6
-; SSE2-NEXT: paddd %xmm10, %xmm6
; SSE2-NEXT: paddd %xmm12, %xmm0
-; SSE2-NEXT: paddd %xmm6, %xmm0
-; SSE2-NEXT: paddd %xmm13, %xmm0
+; SSE2-NEXT: paddd %xmm8, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm9, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index ce42d0d643e8..1afef86a5f11 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -299,20 +299,21 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; GENERIC-NEXT: testb %dil, %dil
; GENERIC-NEXT: jne LBB7_4
; GENERIC-NEXT: ## BB#5:
+; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; GENERIC-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; GENERIC-NEXT: jmp LBB7_6
; GENERIC-NEXT: LBB7_4:
-; GENERIC-NEXT: movd %r9d, %xmm2
-; GENERIC-NEXT: movd %ecx, %xmm3
-; GENERIC-NEXT: movd %r8d, %xmm4
+; GENERIC-NEXT: movd %r9d, %xmm1
+; GENERIC-NEXT: movd %ecx, %xmm2
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; GENERIC-NEXT: movd %r8d, %xmm3
; GENERIC-NEXT: movd %edx, %xmm1
; GENERIC-NEXT: LBB7_6:
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0
; GENERIC-NEXT: movq %xmm0, 16(%rsi)
@@ -339,16 +340,19 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; ATOM-NEXT: jmp LBB7_6
; ATOM-NEXT: LBB7_4:
-; ATOM-NEXT: movd %r9d, %xmm2
-; ATOM-NEXT: movd %ecx, %xmm3
-; ATOM-NEXT: movd %r8d, %xmm4
+; ATOM-NEXT: movd %r9d, %xmm1
+; ATOM-NEXT: movd %ecx, %xmm2
+; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; ATOM-NEXT: movd %r8d, %xmm3
; ATOM-NEXT: movd %edx, %xmm1
-; ATOM-NEXT: LBB7_6:
-; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; ATOM-NEXT: LBB7_6:
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1
; ATOM-NEXT: movq %xmm0, 16(%rsi)
diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll
index 2996edaec3e0..332bf2887fb0 100644
--- a/test/CodeGen/X86/setcc-wide-types.ll
+++ b/test/CodeGen/X86/setcc-wide-types.ll
@@ -58,25 +58,25 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: ne_i256:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r8
+; SSE2-NEXT: movq %xmm4, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r9
-; SSE2-NEXT: movq %xmm0, %r10
-; SSE2-NEXT: movq %xmm1, %rsi
+; SSE2-NEXT: movq %xmm4, %rcx
+; SSE2-NEXT: movq %xmm0, %rdx
+; SSE2-NEXT: movq %xmm1, %r8
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rdi
+; SSE2-NEXT: xorq %rax, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: movq %xmm3, %rdx
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: xorq %r10, %rcx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: xorq %r9, %rax
-; SSE2-NEXT: xorq %r8, %rdi
-; SSE2-NEXT: orq %rax, %rdi
+; SSE2-NEXT: movq %xmm0, %rsi
+; SSE2-NEXT: xorq %rcx, %rsi
+; SSE2-NEXT: orq %rdi, %rsi
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: xorq %rdx, %rax
+; SSE2-NEXT: movq %xmm3, %rcx
+; SSE2-NEXT: xorq %r8, %rcx
+; SSE2-NEXT: orq %rax, %rcx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rcx, %rdi
+; SSE2-NEXT: orq %rsi, %rcx
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
@@ -100,25 +100,25 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: eq_i256:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r8
+; SSE2-NEXT: movq %xmm4, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r9
-; SSE2-NEXT: movq %xmm0, %r10
-; SSE2-NEXT: movq %xmm1, %rsi
+; SSE2-NEXT: movq %xmm4, %rcx
+; SSE2-NEXT: movq %xmm0, %rdx
+; SSE2-NEXT: movq %xmm1, %r8
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rdi
+; SSE2-NEXT: xorq %rax, %rdi
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: movq %xmm3, %rdx
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: xorq %r10, %rcx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: xorq %r9, %rax
-; SSE2-NEXT: xorq %r8, %rdi
-; SSE2-NEXT: orq %rax, %rdi
+; SSE2-NEXT: movq %xmm0, %rsi
+; SSE2-NEXT: xorq %rcx, %rsi
+; SSE2-NEXT: orq %rdi, %rsi
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: xorq %rdx, %rax
+; SSE2-NEXT: movq %xmm3, %rcx
+; SSE2-NEXT: xorq %r8, %rcx
+; SSE2-NEXT: orq %rax, %rcx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rcx, %rdi
+; SSE2-NEXT: orq %rsi, %rcx
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
diff --git a/test/CodeGen/X86/shrink_vmul_sse.ll b/test/CodeGen/X86/shrink_vmul_sse.ll
index c869dff9e642..6701c247e6fc 100644
--- a/test/CodeGen/X86/shrink_vmul_sse.ll
+++ b/test/CodeGen/X86/shrink_vmul_sse.ll
@@ -20,9 +20,9 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; CHECK-NEXT: movzbl 1(%edx,%ecx), %edi
; CHECK-NEXT: movzbl (%edx,%ecx), %edx
; CHECK-NEXT: movzbl 1(%eax,%ecx), %ebx
+; CHECK-NEXT: imull %edi, %ebx
; CHECK-NEXT: movzbl (%eax,%ecx), %eax
; CHECK-NEXT: imull %edx, %eax
-; CHECK-NEXT: imull %edi, %ebx
; CHECK-NEXT: movl %ebx, 4(%esi,%ecx,4)
; CHECK-NEXT: movl %eax, (%esi,%ecx,4)
; CHECK-NEXT: popl %esi
diff --git a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
index d46082f20a45..cbd5c69b1772 100644
--- a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
+++ b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
@@ -5,9 +5,8 @@
define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind {
; AVX2-LABEL: foo2:
; AVX2: # BB#0:
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,1]
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX2-NEXT: vmovapd %xmm1, (%rdi)
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
+; AVX2-NEXT: vmovapd %xmm0, (%rdi)
; AVX2-NEXT: retq
%res = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
%res1 = shufflevector<2 x double> %res, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -18,9 +17,8 @@ define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind {
define <4 x double> @foo4(<4 x double> %v, <4 x double> *%p) nounwind {
; AVX2-LABEL: foo4:
; AVX2: # BB#0:
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,0,2,3]
-; AVX2-NEXT: vmovapd %ymm1, (%rdi)
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX2-NEXT: vmovapd %ymm0, (%rdi)
; AVX2-NEXT: retq
%res = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%res1 = shufflevector<4 x double> %res, <4 x double> undef, <4 x i32> <i32 2, i32 0, i32 undef, i32 undef>
@@ -32,10 +30,8 @@ define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind {
; AVX2-LABEL: foo8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = <2,0,u,u,5,1,3,7>
-; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovapd %ymm1, (%rdi)
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX2-NEXT: vmovapd %ymm0, (%rdi)
; AVX2-NEXT: retq
%res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
%res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> <i32 2, i32 0, i32 undef, i32 undef, i32 5, i32 1, i32 3, i32 7>
@@ -46,7 +42,7 @@ define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind {
define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind {
; AVX2-LABEL: undef_splatmask:
; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-NEXT: retq
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
%res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -66,7 +62,7 @@ define <4 x i32> @undef_splatmask2(<4 x i32> %v) nounwind {
define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind {
; AVX2-LABEL: undef_splatmask3:
; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-NEXT: retq
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
%res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 3>
@@ -76,9 +72,10 @@ define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind {
define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind {
; AVX2-LABEL: undef_splatmask4:
; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX2-NEXT: vmovdqa %xmm1, %xmm0
; AVX2-NEXT: retq
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
%res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -89,9 +86,10 @@ define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind {
define <4 x i32> @undef_splatmask5(<4 x i32> %v, <4 x i32>* %p) nounwind {
; AVX2-LABEL: undef_splatmask5:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastq %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX2-NEXT: vmovdqa %xmm1, %xmm0
; AVX2-NEXT: retq
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 undef>
%res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 3>
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 0b03dffe99b5..d99cfaf535de 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1537,9 +1537,9 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT: retl
;
@@ -1673,13 +1673,13 @@ define void @test_mm_setcsr(i32 %a0) nounwind {
define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X32-LABEL: test_mm_setr_ps:
; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_ps:
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index dfc1aefd31a6..68ab3f9f3205 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -66,7 +66,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X32-NEXT: jne .LBB1_8
; X32-NEXT: .LBB1_7:
; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X32-NEXT: jmp .LBB1_9
+; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT: je .LBB1_10
+; X32-NEXT: jmp .LBB1_11
; X32-NEXT: .LBB1_1:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
@@ -77,11 +80,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X32-NEXT: je .LBB1_7
; X32-NEXT: .LBB1_8: # %entry
; X32-NEXT: xorps %xmm3, %xmm3
-; X32-NEXT: .LBB1_9: # %entry
-; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: jne .LBB1_11
-; X32-NEXT: # BB#10:
+; X32-NEXT: .LBB1_10:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: .LBB1_11: # %entry
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -103,7 +105,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X64-NEXT: jne .LBB1_8
; X64-NEXT: .LBB1_7:
; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X64-NEXT: jmp .LBB1_9
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64-NEXT: testl %esi, %esi
+; X64-NEXT: je .LBB1_10
+; X64-NEXT: jmp .LBB1_11
; X64-NEXT: .LBB1_1:
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: testl %edx, %edx
@@ -114,11 +119,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X64-NEXT: je .LBB1_7
; X64-NEXT: .LBB1_8: # %entry
; X64-NEXT: xorps %xmm3, %xmm3
-; X64-NEXT: .LBB1_9: # %entry
-; X64-NEXT: testl %esi, %esi
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64-NEXT: testl %esi, %esi
; X64-NEXT: jne .LBB1_11
-; X64-NEXT: # BB#10:
+; X64-NEXT: .LBB1_10:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: .LBB1_11: # %entry
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll
index 4d895ea264c5..aed5e0d1c32e 100644
--- a/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -412,14 +412,14 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: subss %xmm4, %xmm3
-; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE-NEXT: addss %xmm0, %xmm4
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE-NEXT: addss %xmm0, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: addss %xmm0, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -431,12 +431,12 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-NEXT: vaddss %xmm0, %xmm4, %xmm4
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 0
%2 = extractelement <4 x float> %B, i32 0
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 503b9416c8d3..4a0dc9c1eb17 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -273,8 +273,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X32: ## BB#0: ## %entry
; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X32-NEXT: addss %xmm1, %xmm0
; X32-NEXT: addss %xmm2, %xmm3
+; X32-NEXT: addss %xmm1, %xmm0
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X32-NEXT: retl
;
@@ -282,8 +282,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X64: ## BB#0: ## %entry
; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X64-NEXT: addss %xmm1, %xmm0
; X64-NEXT: addss %xmm2, %xmm3
+; X64-NEXT: addss %xmm1, %xmm0
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; X64-NEXT: retq
entry:
@@ -896,9 +896,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT: addps %xmm1, %xmm0
; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
-; X32-NEXT: addps %xmm1, %xmm0
; X32-NEXT: addps %xmm2, %xmm3
; X32-NEXT: addps %xmm3, %xmm0
; X32-NEXT: retl
@@ -908,9 +908,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
-; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: addps %xmm2, %xmm3
; X64-NEXT: addps %xmm3, %xmm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/stackmap-frame-setup.ll b/test/CodeGen/X86/stackmap-frame-setup.ll
index b83a8d61f6a2..df5ed5431b8a 100644
--- a/test/CodeGen/X86/stackmap-frame-setup.ll
+++ b/test/CodeGen/X86/stackmap-frame-setup.ll
@@ -7,11 +7,11 @@ entry:
store i64 11, i64* %metadata
store i64 12, i64* %metadata
store i64 13, i64* %metadata
-; ISEL: ADJCALLSTACKDOWN64 0, 0, implicit-def
+; ISEL: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def
; ISEL-NEXT: STACKMAP
; ISEL-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def
call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
-; FAST-ISEL: ADJCALLSTACKDOWN64 0, 0, implicit-def
+; FAST-ISEL: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def
; FAST-ISEL-NEXT: STACKMAP
; FAST-ISEL-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def
ret void
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index a42b3c96c3ae..1eef67764ab9 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -4344,7 +4344,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_4
; AVX1-NEXT: # BB#5:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4
; AVX1-NEXT: jmp .LBB80_6
; AVX1-NEXT: .LBB80_4:
; AVX1-NEXT: movq %rax, %rcx
@@ -4352,22 +4352,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm4
; AVX1-NEXT: .LBB80_6:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_7
; AVX1-NEXT: # BB#8:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
; AVX1-NEXT: jmp .LBB80_9
; AVX1-NEXT: .LBB80_7:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
-; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB80_9:
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
@@ -4397,29 +4397,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX1-NEXT: .LBB80_15:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_16
; AVX1-NEXT: # BB#17:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
; AVX1-NEXT: jmp .LBB80_18
; AVX1-NEXT: .LBB80_16:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
-; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
+; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX1-NEXT: .LBB80_18:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovq %xmm4, %rax
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vmovq %xmm3, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_19
; AVX1-NEXT: # BB#20:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
; AVX1-NEXT: jmp .LBB80_21
; AVX1-NEXT: .LBB80_19:
; AVX1-NEXT: movq %rax, %rcx
@@ -4427,25 +4427,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
+; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: .LBB80_21:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
-; AVX1-NEXT: vpextrq $1, %xmm4, %rax
+; AVX1-NEXT: vpextrq $1, %xmm3, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_22
; AVX1-NEXT: # BB#23:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
; AVX1-NEXT: jmp .LBB80_24
; AVX1-NEXT: .LBB80_22:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
-; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
+; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB80_24:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -4471,7 +4471,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_4
; AVX2-NEXT: # BB#5:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4
; AVX2-NEXT: jmp .LBB80_6
; AVX2-NEXT: .LBB80_4:
; AVX2-NEXT: movq %rax, %rcx
@@ -4479,22 +4479,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm4
; AVX2-NEXT: .LBB80_6:
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_7
; AVX2-NEXT: # BB#8:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
; AVX2-NEXT: jmp .LBB80_9
; AVX2-NEXT: .LBB80_7:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
-; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB80_9:
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
@@ -4524,29 +4524,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX2-NEXT: .LBB80_15:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3]
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_16
; AVX2-NEXT: # BB#17:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
; AVX2-NEXT: jmp .LBB80_18
; AVX2-NEXT: .LBB80_16:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
-; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4
+; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX2-NEXT: .LBB80_18:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vmovq %xmm4, %rax
+; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_19
; AVX2-NEXT: # BB#20:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
; AVX2-NEXT: jmp .LBB80_21
; AVX2-NEXT: .LBB80_19:
; AVX2-NEXT: movq %rax, %rcx
@@ -4554,25 +4554,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
-; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
+; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: .LBB80_21:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
-; AVX2-NEXT: vpextrq $1, %xmm4, %rax
+; AVX2-NEXT: vpextrq $1, %xmm3, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_22
; AVX2-NEXT: # BB#23:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
; AVX2-NEXT: jmp .LBB80_24
; AVX2-NEXT: .LBB80_22:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
-; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1
+; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB80_24:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0]
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
diff --git a/test/CodeGen/X86/vec_set-2.ll b/test/CodeGen/X86/vec_set-2.ll
index 443264cdffd4..51c8b2111107 100644
--- a/test/CodeGen/X86/vec_set-2.ll
+++ b/test/CodeGen/X86/vec_set-2.ll
@@ -1,11 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64
define <4 x float> @test1(float %a) nounwind {
-; CHECK-LABEL: test1:
-; CHECK: # BB#0:
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: retl
+; X86-LABEL: test1:
+; X86: # BB#0:
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0
%tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1
%tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 2
@@ -14,10 +22,15 @@ define <4 x float> @test1(float %a) nounwind {
}
define <2 x i64> @test(i32 %a) nounwind {
-; CHECK-LABEL: test:
-; CHECK: # BB#0:
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: retl
+; X86-LABEL: test:
+; X86: # BB#0:
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: retq
%tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0
%tmp6 = insertelement <4 x i32> %tmp, i32 0, i32 1
%tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2
diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll
index ee4a08599968..b34f30924a8d 100644
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll
@@ -1,11 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X64
define <4 x float> @test(float %a) {
-; CHECK-LABEL: test:
-; CHECK: # BB#0:
-; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,mem[0],zero,zero
-; CHECK-NEXT: retl
+; X86-LABEL: test:
+; X86: # BB#0:
+; X86-NEXT: insertps {{.*#+}} xmm0 = zero,mem[0],zero,zero
+; X86-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: # BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; X64-NEXT: retq
%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
%tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2
%tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3
@@ -13,11 +19,17 @@ define <4 x float> @test(float %a) {
}
define <2 x i64> @test2(i32 %a) {
-; CHECK-LABEL: test2:
-; CHECK: # BB#0:
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; CHECK-NEXT: retl
+; X86-LABEL: test2:
+; X86: # BB#0:
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; X64-NEXT: retq
%tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2
%tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3
%tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>
@@ -25,10 +37,15 @@ define <2 x i64> @test2(i32 %a) {
}
define <4 x float> @test3(<4 x float> %A) {
-; CHECK-LABEL: test3:
-; CHECK: # BB#0:
-; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
-; CHECK-NEXT: retl
+; X86-LABEL: test3:
+; X86: # BB#0:
+; X86-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; X86-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; X64-NEXT: retq
%tmp0 = extractelement <4 x float> %A, i32 0
%tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1
%tmp2 = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 2
diff --git a/test/CodeGen/X86/vec_set-4.ll b/test/CodeGen/X86/vec_set-4.ll
index 8f35529d61b4..09142e16aa6e 100644
--- a/test/CodeGen/X86/vec_set-4.ll
+++ b/test/CodeGen/X86/vec_set-4.ll
@@ -1,12 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
define <2 x i64> @test(i16 %a) nounwind {
-; CHECK-LABEL: test:
-; CHECK: # BB#0:
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0
-; CHECK-NEXT: retl
+; X86-LABEL: test:
+; X86: # BB#0:
+; X86-NEXT: pxor %xmm0, %xmm0
+; X86-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: # BB#0:
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: pinsrw $3, %edi, %xmm0
+; X64-NEXT: retq
%tmp10 = insertelement <8 x i16> zeroinitializer, i16 %a, i32 3
%tmp12 = insertelement <8 x i16> %tmp10, i16 0, i32 4
%tmp14 = insertelement <8 x i16> %tmp12, i16 0, i32 5
@@ -17,12 +24,19 @@ define <2 x i64> @test(i16 %a) nounwind {
}
define <2 x i64> @test2(i8 %a) nounwind {
-; CHECK-LABEL: test2:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: pinsrw $5, %eax, %xmm0
-; CHECK-NEXT: retl
+; X86-LABEL: test2:
+; X86: # BB#0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pxor %xmm0, %xmm0
+; X86-NEXT: pinsrw $5, %eax, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: pinsrw $5, %eax, %xmm0
+; X64-NEXT: retq
%tmp24 = insertelement <16 x i8> zeroinitializer, i8 %a, i32 10
%tmp26 = insertelement <16 x i8> %tmp24, i8 0, i32 11
%tmp28 = insertelement <16 x i8> %tmp26, i8 0, i32 12
diff --git a/test/CodeGen/X86/vec_set-6.ll b/test/CodeGen/X86/vec_set-6.ll
index 4429834b8ef0..3c9aca3a02da 100644
--- a/test/CodeGen/X86/vec_set-6.ll
+++ b/test/CodeGen/X86/vec_set-6.ll
@@ -1,13 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X64
define <4 x float> @test(float %a, float %b, float %c) nounwind {
-; CHECK-LABEL: test:
-; CHECK: # BB#0:
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
-; CHECK-NEXT: retl
+; X86-LABEL: test:
+; X86: # BB#0:
+; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: # BB#0:
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-NEXT: xorps %xmm2, %xmm2
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
+; X64-NEXT: retq
%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
%tmp8 = insertelement <4 x float> %tmp, float %b, i32 2
%tmp10 = insertelement <4 x float> %tmp8, float %c, i32 3
diff --git a/test/CodeGen/X86/vec_set-7.ll b/test/CodeGen/X86/vec_set-7.ll
index e8fe6debb140..757a0d44cd43 100644
--- a/test/CodeGen/X86/vec_set-7.ll
+++ b/test/CodeGen/X86/vec_set-7.ll
@@ -1,12 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
define <2 x i64> @test(<2 x i64>* %p) nounwind {
-; CHECK-LABEL: test:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: retl
+; X86-LABEL: test:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: # BB#0:
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: retq
%tmp = bitcast <2 x i64>* %p to double*
%tmp.upgrd.1 = load double, double* %tmp
%tmp.upgrd.2 = insertelement <2 x double> undef, double %tmp.upgrd.1, i32 0
diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll
index 7a4326c01bb7..a9dceb90855a 100644
--- a/test/CodeGen/X86/vec_set-8.ll
+++ b/test/CodeGen/X86/vec_set-8.ll
@@ -1,11 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
define <2 x i64> @test(i64 %i) nounwind {
-; CHECK-LABEL: test:
-; CHECK: # BB#0:
-; CHECK-NEXT: movq %rdi, %xmm0
-; CHECK-NEXT: retq
+; X86-LABEL: test:
+; X86: # BB#0:
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %xmm0
+; X64-NEXT: retq
%tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
%tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
ret <2 x i64> %tmp11
diff --git a/test/CodeGen/X86/vec_set-A.ll b/test/CodeGen/X86/vec_set-A.ll
index cae39a3d775b..259ace98d362 100644
--- a/test/CodeGen/X86/vec_set-A.ll
+++ b/test/CodeGen/X86/vec_set-A.ll
@@ -1,12 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
define <2 x i64> @test1() nounwind {
-; CHECK-LABEL: test1:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: retl
+; X86-LABEL: test1:
+; X86: # BB#0:
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0:
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: movq %rax, %xmm0
+; X64-NEXT: retq
ret <2 x i64> < i64 1, i64 0 >
}
diff --git a/test/CodeGen/X86/vec_set-B.ll b/test/CodeGen/X86/vec_set-B.ll
index 0580a3376656..ecd9b57cfd0c 100644
--- a/test/CodeGen/X86/vec_set-B.ll
+++ b/test/CodeGen/X86/vec_set-B.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; These should both generate something like this:
;_test3:
@@ -9,26 +10,37 @@
; ret
define <2 x i64> @test3(i64 %arg) nounwind {
-; CHECK-LABEL: test3:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687
-; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: retl
+; X86-LABEL: test3:
+; X86: # BB#0:
+; X86-NEXT: movl $1234567, %eax # imm = 0x12D687
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # BB#0:
+; X64-NEXT: andl $1234567, %edi # imm = 0x12D687
+; X64-NEXT: movq %rdi, %xmm0
+; X64-NEXT: retq
%A = and i64 %arg, 1234567
%B = insertelement <2 x i64> zeroinitializer, i64 %A, i32 0
ret <2 x i64> %B
}
define <2 x i64> @test2(i64 %arg) nounwind {
-; CHECK-LABEL: test2:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687
-; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: retl
+; X86-LABEL: test2:
+; X86: # BB#0:
+; X86-NEXT: movl $1234567, %eax # imm = 0x12D687
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd %eax, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # BB#0:
+; X64-NEXT: andl $1234567, %edi # imm = 0x12D687
+; X64-NEXT: movq %rdi, %xmm0
+; X64-NEXT: retq
%A = and i64 %arg, 1234567
%B = insertelement <2 x i64> undef, i64 %A, i32 0
ret <2 x i64> %B
}
-
diff --git a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll
index 994bc2b3056e..865e2fb83f17 100644
--- a/test/CodeGen/X86/vec_set-C.ll
+++ b/test/CodeGen/X86/vec_set-C.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2,-avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2,-avx | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2,-avx | FileCheck %s --check-prefix=X64
define <2 x i64> @t1(i64 %x) nounwind {
-; X32-LABEL: t1:
-; X32: # BB#0:
-; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: retl
+; X86-LABEL: t1:
+; X86: # BB#0:
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: retl
;
; X64-LABEL: t1:
; X64: # BB#0:
diff --git a/test/CodeGen/X86/vec_set.ll b/test/CodeGen/X86/vec_set.ll
index 49bd3beef75a..6439a6dcb00b 100644
--- a/test/CodeGen/X86/vec_set.ll
+++ b/test/CodeGen/X86/vec_set.ll
@@ -1,27 +1,48 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64
define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
-; CHECK-LABEL: test:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; CHECK-NEXT: movdqa %xmm3, (%eax)
-; CHECK-NEXT: retl
+; X86-LABEL: test:
+; X86: # BB#0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X86-NEXT: movdqa %xmm3, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: # BB#0:
+; X64-NEXT: movd %r8d, %xmm0
+; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: movd %edx, %xmm1
+; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: movd %r9d, %xmm2
+; X64-NEXT: movd %esi, %xmm3
+; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X64-NEXT: movdqa %xmm3, (%rdi)
+; X64-NEXT: retq
%tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0
%tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1
%tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2
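
The split between the X86 and X64 bodies above comes down to the calling convention: i386 cdecl passes all nine arguments on the stack, so the X86 checks are stack loads throughout, while the SysV x86-64 ABI passes the first six integer arguments in registers, so the X64 checks mix five movd-from-GPR moves with three stack loads. A sketch of the assignment for this signature (the mapping is the standard ABI one, not something spelled out in the patch):

declare void @sketch_args(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7)
; SysV x86-64: %b -> %rdi, %a0 -> %esi, %a1 -> %edx, %a2 -> %ecx,
;              %a3 -> %r8d, %a4 -> %r9d, %a5..%a7 -> stack slots
; i386 cdecl:  every argument on the stack (the first movl in the X86
;              body above loads the destination pointer %b into %eax)
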
diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll
index 226c0adbaf3c..2fb821555dba 100644
--- a/test/CodeGen/X86/vector-bitreverse.ll
+++ b/test/CodeGen/X86/vector-bitreverse.ll
@@ -2372,10 +2372,10 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3
-; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index a05a981daa1f..f0a5fe1dbfff 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -848,10 +848,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pandn %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: blend_logic_v8i32:
@@ -860,10 +860,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm3
; SSSE3-NEXT: pandn %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm2
; SSSE3-NEXT: pandn %xmm4, %xmm0
; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: blend_logic_v8i32:
diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll
index f1f795bf3cb0..e3261d15538f 100644
--- a/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -1,15 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD
;
; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt.
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
@@ -194,16 +196,46 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
+; AVX512VLBWDQ-LABEL: testv2i64:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512VLCD-LABEL: testv2i64:
-; AVX512VLCD: ## BB#0:
+; AVX512VLCD: # BB#0:
; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv2i64:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv2i64:
@@ -429,16 +461,46 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
+; AVX512VLBWDQ-LABEL: testv2i64u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512VLCD-LABEL: testv2i64u:
-; AVX512VLCD: ## BB#0:
+; AVX512VLCD: # BB#0:
; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv2i64u:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv2i64u:
@@ -651,16 +713,41 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
+; AVX512VLBWDQ-LABEL: testv4i32:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512VLCD-LABEL: testv4i32:
-; AVX512VLCD: ## BB#0:
+; AVX512VLCD: # BB#0:
; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i32:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv4i32:
@@ -867,16 +954,41 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
+; AVX512VLBWDQ-LABEL: testv4i32u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512VLCD-LABEL: testv4i32u:
-; AVX512VLCD: ## BB#0:
+; AVX512VLCD: # BB#0:
; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i32u:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv4i32u:
@@ -1054,8 +1166,28 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
+; AVX512VLBWDQ-LABEL: testv8i16:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512VLCD-LABEL: testv8i16:
-; AVX512VLCD: ## BB#0:
+; AVX512VLCD: # BB#0:
; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0
@@ -1063,7 +1195,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv8i16:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
@@ -1238,8 +1370,28 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
+; AVX512VLBWDQ-LABEL: testv8i16u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512VLCD-LABEL: testv8i16u:
-; AVX512VLCD: ## BB#0:
+; AVX512VLCD: # BB#0:
; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0
@@ -1247,7 +1399,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv8i16u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
@@ -1399,8 +1551,23 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
+; AVX512VLBWDQ-LABEL: testv16i8:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm2, %xmm1
+; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512-LABEL: testv16i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vplzcntd %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
@@ -1546,8 +1713,23 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
+; AVX512VLBWDQ-LABEL: testv16i8u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm2, %xmm1
+; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512-LABEL: testv16i8u:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vplzcntd %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
@@ -1582,17 +1764,17 @@ define <2 x i64> @foldv2i64() nounwind {
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv2i64:
-; AVX: # BB#0:
-; AVX-NEXT: movl $55, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: retq
+; NOBW-LABEL: foldv2i64:
+; NOBW: # BB#0:
+; NOBW-NEXT: movl $55, %eax
+; NOBW-NEXT: vmovq %rax, %xmm0
+; NOBW-NEXT: retq
;
-; AVX512-LABEL: foldv2i64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: movl $55, %eax
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: retq
+; AVX512VLBWDQ-LABEL: foldv2i64:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: movl $55, %eax
+; AVX512VLBWDQ-NEXT: vmovq %rax, %xmm0
+; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv2i64:
; X32-SSE: # BB#0:
@@ -1610,17 +1792,17 @@ define <2 x i64> @foldv2i64u() nounwind {
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv2i64u:
-; AVX: # BB#0:
-; AVX-NEXT: movl $55, %eax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: retq
+; NOBW-LABEL: foldv2i64u:
+; NOBW: # BB#0:
+; NOBW-NEXT: movl $55, %eax
+; NOBW-NEXT: vmovq %rax, %xmm0
+; NOBW-NEXT: retq
;
-; AVX512-LABEL: foldv2i64u:
-; AVX512: ## BB#0:
-; AVX512-NEXT: movl $55, %eax
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: retq
+; AVX512VLBWDQ-LABEL: foldv2i64u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: movl $55, %eax
+; AVX512VLBWDQ-NEXT: vmovq %rax, %xmm0
+; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv2i64u:
; X32-SSE: # BB#0:
@@ -1637,15 +1819,15 @@ define <4 x i32> @foldv4i32() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv4i32:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
-; AVX-NEXT: retq
+; NOBW-LABEL: foldv4i32:
+; NOBW: # BB#0:
+; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
+; NOBW-NEXT: retq
;
-; AVX512-LABEL: foldv4i32:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
-; AVX512-NEXT: retq
+; AVX512VLBWDQ-LABEL: foldv4i32:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
+; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv4i32:
; X32-SSE: # BB#0:
@@ -1661,15 +1843,15 @@ define <4 x i32> @foldv4i32u() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv4i32u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
-; AVX-NEXT: retq
+; NOBW-LABEL: foldv4i32u:
+; NOBW: # BB#0:
+; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
+; NOBW-NEXT: retq
;
-; AVX512-LABEL: foldv4i32u:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
-; AVX512-NEXT: retq
+; AVX512VLBWDQ-LABEL: foldv4i32u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
+; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv4i32u:
; X32-SSE: # BB#0:
@@ -1685,15 +1867,15 @@ define <8 x i16> @foldv8i16() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
-; AVX-NEXT: retq
+; NOBW-LABEL: foldv8i16:
+; NOBW: # BB#0:
+; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
+; NOBW-NEXT: retq
;
-; AVX512-LABEL: foldv8i16:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
-; AVX512-NEXT: retq
+; AVX512VLBWDQ-LABEL: foldv8i16:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
+; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv8i16:
; X32-SSE: # BB#0:
@@ -1709,15 +1891,15 @@ define <8 x i16> @foldv8i16u() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv8i16u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
-; AVX-NEXT: retq
+; NOBW-LABEL: foldv8i16u:
+; NOBW: # BB#0:
+; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
+; NOBW-NEXT: retq
;
-; AVX512-LABEL: foldv8i16u:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
-; AVX512-NEXT: retq
+; AVX512VLBWDQ-LABEL: foldv8i16u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
+; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv8i16u:
; X32-SSE: # BB#0:
@@ -1733,15 +1915,15 @@ define <16 x i8> @foldv16i8() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv16i8:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
-; AVX-NEXT: retq
+; NOBW-LABEL: foldv16i8:
+; NOBW: # BB#0:
+; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
+; NOBW-NEXT: retq
;
-; AVX512-LABEL: foldv16i8:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
-; AVX512-NEXT: retq
+; AVX512VLBWDQ-LABEL: foldv16i8:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
+; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv16i8:
; X32-SSE: # BB#0:
@@ -1757,15 +1939,15 @@ define <16 x i8> @foldv16i8u() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv16i8u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
-; AVX-NEXT: retq
+; NOBW-LABEL: foldv16i8u:
+; NOBW: # BB#0:
+; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
+; NOBW-NEXT: retq
;
-; AVX512-LABEL: foldv16i8u:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
-; AVX512-NEXT: retq
+; AVX512VLBWDQ-LABEL: foldv16i8u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
+; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv16i8u:
; X32-SSE: # BB#0:
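
Two things change at once in the vector-lzcnt RUN lines above. The apple-darwin AVX-512 runs move to x86_64-unknown-unknown, which switches the assembly comment string from "##" to "#" and accounts for every "## BB#0:" check becoming "# BB#0:". And each RUN line now stacks check prefixes from generic to specific (X64, NOBW, AVX512, ...), letting the update script hoist a body shared by several configurations up to the broadest common prefix: NOBW covers every run without AVX512BW, which is why the foldv* constants keep a single vmovaps body under NOBW while the new +avx512vl,+avx512bw,+avx512dq run alone checks vmovdqu. A sketch of how one folded case would be laid out (@sketch_fold and its constant are illustrative only):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ

declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)

define <4 x i32> @sketch_fold() nounwind {
; If both runs fold to the same constant, the script emits one body under
; the shared X64 prefix; if they differ, the first run checks under NOBW
; (or AVX1) and the second under AVX512VLBWDQ.
  %r = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 false)
  ret <4 x i32> %r
}
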
diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll
index 53cb4d8e445b..185e1f4865ea 100644
--- a/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -1,11 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD
;
; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt.
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32-AVX
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
@@ -93,16 +95,76 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512VL-LABEL: testv4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: testv4i64:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512VLCD-LABEL: testv4i64:
-; AVX512VLCD: ## BB#0:
+; AVX512VLCD: # BB#0:
; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i64:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
;
; X32-AVX-LABEL: testv4i64:
@@ -225,16 +287,76 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512VL-LABEL: testv4i64u:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: testv4i64u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512VLCD-LABEL: testv4i64u:
-; AVX512VLCD: ## BB#0:
+; AVX512VLCD: # BB#0:
; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i64u:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
;
; X32-AVX-LABEL: testv4i64u:
@@ -342,16 +464,66 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512VL-LABEL: testv8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: testv8i32:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512VLCD-LABEL: testv8i32:
-; AVX512VLCD: ## BB#0:
+; AVX512VLCD: # BB#0:
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv8i32:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
;
; X32-AVX-LABEL: testv8i32:
@@ -454,16 +626,66 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512VL-LABEL: testv8i32u:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: testv8i32u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512VLCD-LABEL: testv8i32u:
-; AVX512VLCD: ## BB#0:
+; AVX512VLCD: # BB#0:
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv8i32u:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
;
; X32-AVX-LABEL: testv8i32u:
@@ -551,8 +773,48 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512VL-LABEL: testv16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: testv16i16:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512-LABEL: testv16i16:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT: vplzcntd %zmm0, %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
@@ -638,8 +900,48 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512VL-LABEL: testv16i16u:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: testv16i16u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512-LABEL: testv16i16u:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT: vplzcntd %zmm0, %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
@@ -710,8 +1012,38 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512VL-LABEL: testv32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: testv32i8:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512-LABEL: testv32i8:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512-NEXT: vplzcntd %zmm1, %zmm1
@@ -784,8 +1116,38 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512VL-LABEL: testv32i8u:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: testv32i8u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VLBWDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VLBWDQ-NEXT: retq
+;
; AVX512-LABEL: testv32i8u:
-; AVX512: ## BB#0:
+; AVX512: # BB#0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512-NEXT: vplzcntd %zmm1, %zmm1
@@ -818,15 +1180,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
}
define <4 x i64> @foldv4i64() nounwind {
-; AVX-LABEL: foldv4i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: foldv4i64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
-; AVX512-NEXT: retq
+; X64-LABEL: foldv4i64:
+; X64: # BB#0:
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
+; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv4i64:
; X32-AVX: # BB#0:
@@ -837,15 +1194,10 @@ define <4 x i64> @foldv4i64() nounwind {
}
define <4 x i64> @foldv4i64u() nounwind {
-; AVX-LABEL: foldv4i64u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: foldv4i64u:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
-; AVX512-NEXT: retq
+; X64-LABEL: foldv4i64u:
+; X64: # BB#0:
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
+; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv4i64u:
; X32-AVX: # BB#0:
@@ -856,15 +1208,10 @@ define <4 x i64> @foldv4i64u() nounwind {
}
define <8 x i32> @foldv8i32() nounwind {
-; AVX-LABEL: foldv8i32:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: foldv8i32:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
-; AVX512-NEXT: retq
+; X64-LABEL: foldv8i32:
+; X64: # BB#0:
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
+; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv8i32:
; X32-AVX: # BB#0:
@@ -875,15 +1222,10 @@ define <8 x i32> @foldv8i32() nounwind {
}
define <8 x i32> @foldv8i32u() nounwind {
-; AVX-LABEL: foldv8i32u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: foldv8i32u:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
-; AVX512-NEXT: retq
+; X64-LABEL: foldv8i32u:
+; X64: # BB#0:
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
+; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv8i32u:
; X32-AVX: # BB#0:
@@ -894,15 +1236,15 @@ define <8 x i32> @foldv8i32u() nounwind {
}
define <16 x i16> @foldv16i16() nounwind {
-; AVX-LABEL: foldv16i16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: foldv16i16:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
-; AVX512-NEXT: retq
+; NOBW-LABEL: foldv16i16:
+; NOBW: # BB#0:
+; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
+; NOBW-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: foldv16i16:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
+; AVX512VLBWDQ-NEXT: retq
;
; X32-AVX-LABEL: foldv16i16:
; X32-AVX: # BB#0:
@@ -913,15 +1255,15 @@ define <16 x i16> @foldv16i16() nounwind {
}
define <16 x i16> @foldv16i16u() nounwind {
-; AVX-LABEL: foldv16i16u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: foldv16i16u:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
-; AVX512-NEXT: retq
+; NOBW-LABEL: foldv16i16u:
+; NOBW: # BB#0:
+; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
+; NOBW-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: foldv16i16u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
+; AVX512VLBWDQ-NEXT: retq
;
; X32-AVX-LABEL: foldv16i16u:
; X32-AVX: # BB#0:
@@ -932,15 +1274,15 @@ define <16 x i16> @foldv16i16u() nounwind {
}
define <32 x i8> @foldv32i8() nounwind {
-; AVX-LABEL: foldv32i8:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: foldv32i8:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
-; AVX512-NEXT: retq
+; NOBW-LABEL: foldv32i8:
+; NOBW: # BB#0:
+; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
+; NOBW-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: foldv32i8:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
+; AVX512VLBWDQ-NEXT: retq
;
; X32-AVX-LABEL: foldv32i8:
; X32-AVX: # BB#0:
@@ -951,15 +1293,15 @@ define <32 x i8> @foldv32i8() nounwind {
}
define <32 x i8> @foldv32i8u() nounwind {
-; AVX-LABEL: foldv32i8u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: foldv32i8u:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
-; AVX512-NEXT: retq
+; NOBW-LABEL: foldv32i8u:
+; NOBW: # BB#0:
+; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
+; NOBW-NEXT: retq
+;
+; AVX512VLBWDQ-LABEL: foldv32i8u:
+; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
+; AVX512VLBWDQ-NEXT: retq
;
; X32-AVX-LABEL: foldv32i8u:
; X32-AVX: # BB#0:
diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll
new file mode 100644
index 000000000000..f737ea2b7fba
--- /dev/null
+++ b/test/CodeGen/X86/vector-narrow-binop.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
+
+; AVX1 has support for 256-bit bitwise logic because the FP variants were included.
+; If using those ops requires extra insert/extract, though, it's probably not worth it
+; (a per-half sketch follows the function below).
+
+define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
+; SSE-LABEL: PR32790:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: psubd %xmm6, %xmm0
+; SSE-NEXT: psubd %xmm7, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: PR32790:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR32790:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: PR32790:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpsubd %ymm3, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %add = add <8 x i32> %a, %b
+ %and = and <8 x i32> %add, %c
+ %sub = sub <8 x i32> %and, %d
+ ret <8 x i32> %sub
+}
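The SSE lowering above is what complete narrowing looks like: the same three instructions applied independently to each 128-bit half, with no cross-lane traffic. As an editorial sketch (not part of the patch; the function name is hypothetical), the per-half computation in IR is:

define <4 x i32> @pr32790_half(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
  %add = add <4 x i32> %a, %b    ; paddd on one xmm half
  %and = and <4 x i32> %add, %c  ; pand
  %sub = sub <4 x i32> %and, %d  ; psubd
  ret <4 x i32> %sub
}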
+
+; In a more extreme case, even the later AVX targets should not use a 256-bit op just
+; because it is supported, since doing so requires extra extract/insert instructions
+; (a narrowed sketch follows the function below).
+
+define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
+; SSE-LABEL: do_not_use_256bit_op:
+; SSE: # BB#0:
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: do_not_use_256bit_op:
+; AVX1: # BB#0:
+; AVX1-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: do_not_use_256bit_op:
+; AVX2: # BB#0:
+; AVX2-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: do_not_use_256bit_op:
+; AVX512: # BB#0:
+; AVX512-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %concat2 = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %and = and <8 x i32> %concat1, %concat2
+ %extract1 = shufflevector <8 x i32> %and, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %extract2 = shufflevector <8 x i32> %and, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %sub = sub <4 x i32> %extract1, %extract2
+ ret <4 x i32> %sub
+}
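Since %extract1 reduces to %a & %c and %extract2 to %b & %d, an ideal combine deletes the 256-bit concat/extract round-trip entirely. A sketch of the fully narrowed IR (editorial illustration; the function name is hypothetical):

define <4 x i32> @do_not_use_256bit_op_narrowed(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
  %and1 = and <4 x i32> %a, %c   ; low half of the former 256-bit and
  %and2 = and <4 x i32> %b, %d   ; high half
  %sub = sub <4 x i32> %and1, %and2
  ret <4 x i32> %sub
}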
+
diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll
index f05588a2920c..99a05c3d49c0 100644
--- a/test/CodeGen/X86/vector-pcmp.ll
+++ b/test/CodeGen/X86/vector-pcmp.ll
@@ -148,8 +148,8 @@ define <32 x i8> @test_pcmpgtb_256(<32 x i8> %x) {
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -177,8 +177,8 @@ define <16 x i16> @test_pcmpgtw_256(<16 x i16> %x) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -206,8 +206,8 @@ define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -242,14 +242,13 @@ define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) {
;
; AVX1-LABEL: test_pcmpgtq_256:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
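Editorial note on the recurring change in this file: the all-ones ymm used for the final NOT is now materialized with vxorps + vcmptrueps (a compare with the always-true predicate returns all-ones for any inputs, and xoring the register with itself first avoids a false dependency) rather than a 128-bit vpcmpeqd widened by vinsertf128. The mask exists because each of these tests ends in a vector NOT; a minimal IR reproducer of that last step (hypothetical name) is:

define <4 x i64> @not_v4i64(<4 x i64> %x) {
  ; xor with splat(-1) is the canonical vector NOT; the -1 splat is the
  ; all-ones value the vcmptrueps sequence builds
  %r = xor <4 x i64> %x, <i64 -1, i64 -1, i64 -1, i64 -1>
  ret <4 x i64> %r
}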
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index af3ddcf8048e..09e143ddcd4d 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -7,6 +7,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+;
+; 32-bit runs to make sure we do reasonable things for i64 shifts.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
;
; Variable Shifts
@@ -81,6 +85,41 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2
+; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
+; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3
+; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsubq %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = ashr <4 x i64> %a, %b
ret <4 x i64> %shift
}
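The X32 sequences above implement the usual arithmetic-from-logical shift identity: with m = lshr(0x8000000000000000, s), ashr(x, s) == (lshr(x, s) ^ m) - m, which re-extends the sign bit; the [0,2147483648,...] constant is that sign-bit mask written as i32 pairs on the 32-bit target. An editorial IR sketch of the identity (hypothetical name, shift amounts assumed in [0,63]):

define <4 x i64> @ashr_via_lshr(<4 x i64> %x, <4 x i64> %s) {
  ; %m has a single bit set at position 63-s in each lane
  %m = lshr <4 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>, %s
  %l = lshr <4 x i64> %x, %s     ; zero-filled shift
  %f = xor <4 x i64> %l, %m      ; flip the shifted-down sign bit
  %r = sub <4 x i64> %f, %m      ; borrow propagates ones into the top s bits
  ret <4 x i64> %r
}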
@@ -147,6 +186,41 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
+; X32-AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; X32-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6
+; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; X32-AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; X32-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; X32-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; X32-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = ashr <8 x i32> %a, %b
ret <8 x i32> %shift
}
@@ -253,6 +327,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
+; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X32-AVX1-NEXT: vpsraw $8, %xmm4, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
+; X32-AVX1-NEXT: vpsraw $4, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsraw $2, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsraw $1, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
+; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
+; X32-AVX1-NEXT: vpsraw $8, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; X32-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
+; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; X32-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = ashr <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -436,6 +559,89 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
+; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
+; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
+; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
+; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
+; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
+; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
+; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
+; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
+; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = ashr <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -499,6 +705,33 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax
+; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax
+; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
%shift = ashr <4 x i64> %a, %splat
ret <4 x i64> %shift
@@ -546,6 +779,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; X32-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; X32-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
%shift = ashr <8 x i32> %a, %splat
ret <8 x i32> %shift
@@ -593,6 +841,21 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X32-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X32-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
%shift = ashr <16 x i16> %a, %splat
ret <16 x i16> %shift
@@ -776,6 +1039,84 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm6
+; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm6
+; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm2
+; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; X32-AVX1-NEXT: vpsraw $4, %xmm3, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsraw $2, %xmm3, %xmm5
+; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsraw $1, %xmm3, %xmm5
+; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1
+; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
+; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
+; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
+; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
+; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
+; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = ashr <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -843,6 +1184,43 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2
+; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
+; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
+; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3
+; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsubq %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
ret <4 x i64> %shift
}
@@ -893,6 +1271,29 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X32-AVX1-NEXT: vpsrad $6, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpsrad $4, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpsrad $9, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpsrad $8, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsravd {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
ret <8 x i32> %shift
}
@@ -980,6 +1381,40 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsraw $4, %xmm1, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpsraw $2, %xmm1, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; X32-AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
+; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
+; X32-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
+; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; X32-AVX2-NEXT: vpsravd %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
}
@@ -1149,6 +1584,81 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8192,24640,41088,57536,49376,32928,16480,32]
+; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm6
+; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm6
+; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm2
+; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; X32-AVX1-NEXT: vpsraw $4, %xmm3, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsraw $2, %xmm3, %xmm5
+; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsraw $1, %xmm3, %xmm5
+; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1
+; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
+; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
+; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
+; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
+; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -1206,6 +1716,25 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
+; X32-AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
+; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; X32-AVX2-NEXT: retl
%shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
}
@@ -1246,6 +1775,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
}
@@ -1286,6 +1828,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
}
@@ -1352,6 +1907,31 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; X32-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
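The byte case applies the same sign-fix identity with fixed constants: x86 has no vector i8 shift, so the code shifts 16-bit lanes by 3, masks each byte to its 5 surviving bits (31 = 0xff >> 3), and re-extends the sign with m = 16 = 0x80 >> 3. Per byte this computes the following (editorial sketch, hypothetical name):

define i8 @ashr_i8_by_3(i8 %x) {
  %l = lshr i8 %x, 3   ; vpsrlw $3 plus the vpand with 31
  %f = xor i8 %l, 16   ; 16 is the sign bit after shifting right by 3
  %r = sub i8 %f, 16   ; subtracting re-extends the sign
  ret i8 %r
}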
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
index 60575250d713..46be36b76e98 100644
--- a/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -7,6 +7,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+;
+; 32-bit runs to make sure we do reasonable things for i64 shifts.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
;
; Variable Shifts
@@ -59,6 +63,26 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <4 x i64> %a, %b
ret <4 x i64> %shift
}
@@ -125,6 +149,41 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
+; X32-AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; X32-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6
+; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; X32-AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; X32-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; X32-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; X32-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <8 x i32> %a, %b
ret <8 x i32> %shift
}
@@ -231,6 +290,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
+; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
+; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
+; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
+; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; X32-AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
+; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; X32-AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -357,6 +465,56 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; X32-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X32-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -401,6 +559,23 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax
+; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax
+; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
%shift = lshr <4 x i64> %a, %splat
ret <4 x i64> %shift
@@ -448,6 +623,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; X32-AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; X32-AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
%shift = lshr <8 x i32> %a, %splat
ret <8 x i32> %shift
@@ -495,6 +685,21 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
%shift = lshr <16 x i16> %a, %splat
ret <16 x i16> %shift
@@ -625,6 +830,55 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
+; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
+; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = lshr <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -677,6 +931,27 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrlvq {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
ret <4 x i64> %shift
}
@@ -727,6 +1002,29 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X32-AVX1-NEXT: vpsrld $6, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpsrld $4, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpsrld $7, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpsrld $9, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpsrld $8, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrlvd {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
ret <8 x i32> %shift
}
@@ -814,6 +1112,40 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; X32-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
+; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
+; X32-AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
+; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; X32-AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
}
@@ -930,6 +1262,52 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8192,24640,41088,57536,49376,32928,16480,32]
+; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsrlw $2, %xmm1, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; X32-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4
+; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1
+; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm1
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm1
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -974,6 +1352,19 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
}
@@ -1014,6 +1405,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsrld $5, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrld $5, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
}
@@ -1054,6 +1458,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
}
@@ -1103,6 +1520,23 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll
index 7f534050b6a7..4a134f440a78 100644
--- a/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -7,6 +7,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+;
+; 32-bit runs to make sure we do reasonable things for i64 shifts; see the sketch below.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
;
; Variable Shifts
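
A minimal standalone sketch of the case those X32 runs exercise (a hypothetical reduction, not part of the imported test file; the name @shl_v4i64_sketch is made up for illustration): on i686 a per-element i64 shift amount cannot be carried in a 32-bit GPR, so the AVX lowering keeps the counts in XMM registers.

define <4 x i64> @shl_v4i64_sketch(<4 x i64> %a, <4 x i64> %b) nounwind {
; Fed to `llc -mtriple=i686-unknown-unknown -mattr=+avx`, this should produce
; the vextractf128/vpsllq sequence checked in var_shift_v4i64 below, with each
; shift count taken from the low 64 bits of an XMM register rather than a GPR.
  %shift = shl <4 x i64> %a, %b
  ret <4 x i64> %shift
}
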
@@ -56,6 +60,26 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <4 x i64> %a, %b
ret <4 x i64> %shift
}
@@ -105,6 +129,27 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; X32-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X32-AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpslld $23, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; X32-AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
+; X32-AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <8 x i32> %a, %b
ret <8 x i32> %shift
}
@@ -205,6 +250,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
+; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X32-AVX1-NEXT: vpsllw $8, %xmm4, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
+; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllw $2, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllw $1, %xmm2, %xmm4
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
+; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
+; X32-AVX1-NEXT: vpsllw $8, %xmm0, %xmm4
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsllw $1, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; X32-AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
+; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; X32-AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <16 x i16> %a, %b
ret <16 x i16> %shift
}
@@ -319,6 +413,52 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: var_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; X32-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllw $2, %xmm2, %xmm3
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: var_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <32 x i8> %a, %b
ret <32 x i8> %shift
}
@@ -363,6 +503,23 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax
+; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax
+; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
%shift = shl <4 x i64> %a, %splat
ret <4 x i64> %shift
@@ -410,6 +567,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; X32-AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; X32-AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
%shift = shl <8 x i32> %a, %splat
ret <8 x i32> %shift
@@ -457,6 +629,21 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X32-AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; X32-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
%shift = shl <16 x i16> %a, %splat
ret <16 x i16> %shift
@@ -577,6 +764,51 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatvar_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllw $2, %xmm2, %xmm3
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; X32-AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm6
+; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; X32-AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7
+; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatvar_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
+; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%shift = shl <32 x i8> %a, %splat
ret <32 x i8> %shift
@@ -626,6 +858,27 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsllvq {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
ret <4 x i64> %shift
}
@@ -666,6 +919,19 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsllvd {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
ret <8 x i32> %shift
}
@@ -719,6 +985,19 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpmullw {{\.LCPI.*}}, %xmm0, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpmullw {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpmullw {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
}
@@ -827,6 +1106,48 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
+;
+; X32-AVX1-LABEL: constant_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8192,24640,41088,57536,49376,32928,16480,32]
+; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsllw $2, %xmm1, %xmm2
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; X32-AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6
+; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
+; X32-AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7
+; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: constant_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1
+; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm1
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm1
+; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
ret <32 x i8> %shift
}
@@ -871,6 +1192,19 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v4i64:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpsllq $7, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v4i64:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsllq $7, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %shift
}
@@ -911,6 +1245,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpslld $5, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v8i32:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpslld $5, %xmm0, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpslld $5, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v8i32:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpslld $5, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %shift
}
@@ -951,6 +1298,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v16i16:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm1
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v16i16:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
}
@@ -999,6 +1359,23 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
+;
+; X32-AVX1-LABEL: splatconstant_shift_v32i8:
+; X32-AVX1: # BB#0:
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
+; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
+; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
+; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: splatconstant_shift_v32i8:
+; X32-AVX2: # BB#0:
+; X32-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-AVX2-NEXT: retl
%shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 26cd7301fe60..7a5c992bb829 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -1,129 +1,235 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefixes=ALL,KNL %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefixes=ALL,SKX %s
target triple = "x86_64-unknown-unknown"
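; As a hedged regeneration recipe (an assumption, not stated in this patch:
; it presumes a freshly built llc on PATH, and script flags can vary by
; revision), the KNL/SKX checks in this file can be rebuilt with:
;   utils/update_llc_test_checks.py test/CodeGen/X86/vector-shuffle-512-v32.ll
; The script reads the RUN lines above to decide which llc invocations to run
; and which check prefixes to emit.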
-define <32 x i16> @shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i16> %a) {
-; ALL-LABEL: shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; ALL: # BB#0:
-; ALL-NEXT: vpbroadcastw %xmm0, %zmm0
-; ALL-NEXT: retq
+define <32 x i16> @shuffle_v32i16(<32 x i16> %a) {
+; KNL-LABEL: shuffle_v32i16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpbroadcastw %xmm0, %zmm0
+; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> zeroinitializer
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<32 x i16> %a) {
-; ALL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
-; ALL: # BB#0:
-; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; ALL-NEXT: vpbroadcastw %xmm0, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; KNL: ## BB#0:
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; SKX: ## BB#0:
+; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; SKX-NEXT: vpbroadcastw %xmm0, %zmm0
+; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) {
-; ALL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
-; ALL: # BB#0:
-; ALL-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31>
-; ALL-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
+; KNL: ## BB#0:
+; KNL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19]
+; KNL-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19]
+; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255>
+; KNL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0
+; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31]
+; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u>
+; KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,255,255,255,255,255,255,255,255,u,u,255,255,255,255,u,u,255,255,0,0>
+; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31>
+; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1, i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 31>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) {
-; ALL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
-; ALL: # BB#0:
-; ALL-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56]
-; ALL-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
+; KNL: ## BB#0:
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; KNL-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; KNL-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,12,13,12,13,10,11,0,1,4,5,4,5,0,1]
+; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
+; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,3,2,2,4,5,6,7]
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm1
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm5
+; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,8,9,14,15,4,5,2,3,2,3,6,7]
+; KNL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15]
+; KNL-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3
+; KNL-NEXT: vpbroadcastw %xmm3, %ymm3
+; KNL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; KNL-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
+; KNL-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56]
+; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
+; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24, i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 56>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
-; ALL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
-; ALL: # BB#0:
-; ALL-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
+; KNL: ## BB#0:
+; KNL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
+; SKX: ## BB#0:
+; SKX-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
-; ALL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
-; ALL: # BB#0:
-; ALL-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
+; KNL: ## BB#0:
+; KNL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
+; SKX: ## BB#0:
+; SKX-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z(<32 x i16> %a, <32 x i16> %b) {
-; ALL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z:
-; ALL: # BB#0:
-; ALL-NEXT: vpsrld $16, %zmm0, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrld $16, %ymm0, %ymm0
+; KNL-NEXT: vpsrld $16, %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsrld $16, %zmm0, %zmm0
+; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 34, i32 3, i32 34, i32 5, i32 34, i32 7, i32 34, i32 9, i32 34, i32 11, i32 34, i32 13, i32 34, i32 15, i32 34, i32 17, i32 34, i32 19, i32 34, i32 21, i32 34, i32 23, i32 34, i32 25, i32 34, i32 27, i32 34, i32 29, i32 34, i32 31, i32 34>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30(<32 x i16> %a, <32 x i16> %b) {
-; ALL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30:
-; ALL: # BB#0:
-; ALL-NEXT: vpslld $16, %zmm0, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $16, %ymm0, %ymm0
+; KNL-NEXT: vpslld $16, %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $16, %zmm0, %zmm0
+; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 34, i32 0, i32 34, i32 2, i32 34, i32 4, i32 34, i32 6, i32 34, i32 8, i32 34, i32 10, i32 34, i32 12, i32 34, i32 14, i32 34, i32 16, i32 34, i32 18, i32 34, i32 20, i32 34, i32 22, i32 34, i32 24, i32 34, i32 26, i32 34, i32 28, i32 34, i32 30>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) {
-; ALL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
-; ALL: # BB#0:
-; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
+; KNL: ## BB#0:
+; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
+; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
+; SKX: ## BB#0:
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
+; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
-; ALL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
-; ALL: # BB#0:
-; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
+; KNL: ## BB#0:
+; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
+; SKX: ## BB#0:
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
+; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
-; ALL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
-; ALL: # BB#0:
-; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
-; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
+; KNL: ## BB#0:
+; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
+; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
+; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
+; SKX: ## BB#0:
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
+; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 5, i32 5, i32 4, i32 4, i32 9, i32 9, i32 8, i32 8, i32 13, i32 13, i32 12, i32 12, i32 17, i32 17, i32 16, i32 16, i32 21, i32 21, i32 20, i32 20, i32 25, i32 25, i32 24, i32 24, i32 29, i32 29, i32 28, i32 28>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
-; ALL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
-; ALL: # BB#0:
-; ALL-NEXT: movl $1, %eax
-; ALL-NEXT: kmovd %eax, %k1
-; ALL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
+; KNL: ## BB#0:
+; KNL-NEXT: movl $65535, %eax ## imm = 0xFFFF
+; KNL-NEXT: vmovd %eax, %xmm1
+; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
+; SKX: ## BB#0:
+; SKX-NEXT: movl $1, %eax
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
ret <32 x i16> %shuffle
}
define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) {
-; ALL-LABEL: insert_dup_mem_v32i16_i32:
-; ALL: # BB#0:
-; ALL-NEXT: movl (%rdi), %eax
-; ALL-NEXT: vpbroadcastw %ax, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: insert_dup_mem_v32i16_i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpbroadcastw (%rdi), %ymm0
+; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_dup_mem_v32i16_i32:
+; SKX: ## BB#0:
+; SKX-NEXT: movl (%rdi), %eax
+; SKX-NEXT: vpbroadcastw %ax, %zmm0
+; SKX-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
@@ -132,11 +238,19 @@ define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) {
}
define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) {
-; ALL-LABEL: insert_dup_mem_v32i16_sext_i16:
-; ALL: # BB#0:
-; ALL-NEXT: movswl (%rdi), %eax
-; ALL-NEXT: vpbroadcastw %ax, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: insert_dup_mem_v32i16_sext_i16:
+; KNL: ## BB#0:
+; KNL-NEXT: movswl (%rdi), %eax
+; KNL-NEXT: vmovd %eax, %xmm0
+; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_dup_mem_v32i16_sext_i16:
+; SKX: ## BB#0:
+; SKX-NEXT: movswl (%rdi), %eax
+; SKX-NEXT: vpbroadcastw %ax, %zmm0
+; SKX-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
%tmp1 = sext i16 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
@@ -146,11 +260,17 @@ define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) {
}
define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 {
-; ALL-LABEL: insert_dup_elt1_mem_v32i16_i32:
-; ALL: # BB#0:
-; ALL-NEXT: movzwl 2(%rdi), %eax
-; ALL-NEXT: vpbroadcastw %ax, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: insert_dup_elt1_mem_v32i16_i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0
+; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32:
+; SKX: ## BB#0:
+; SKX-NEXT: movzwl 2(%rdi), %eax
+; SKX-NEXT: vpbroadcastw %ax, %zmm0
+; SKX-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
@@ -159,11 +279,17 @@ define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 {
}
define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 {
-; ALL-LABEL: insert_dup_elt3_mem_v32i16_i32:
-; ALL: # BB#0:
-; ALL-NEXT: movzwl 2(%rdi), %eax
-; ALL-NEXT: vpbroadcastw %ax, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: insert_dup_elt3_mem_v32i16_i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0
+; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32:
+; SKX: ## BB#0:
+; SKX-NEXT: movzwl 2(%rdi), %eax
+; SKX-NEXT: vpbroadcastw %ax, %zmm0
+; SKX-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
@@ -172,19 +298,79 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 {
}
define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) {
-; ALL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
-; ALL: # BB#0:
-; ALL-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; KNL-NEXT: vmovdqa %ymm2, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 32, i32 0, i32 0, i32 0, i32 33, i32 0, i32 0, i32 0, i32 34, i32 0, i32 0, i32 0, i32 35, i32 0, i32 0, i32 0, i32 36, i32 0, i32 0, i32 0, i32 37, i32 0, i32 0, i32 0, i32 38, i32 0, i32 0, i32 0, i32 39, i32 0, i32 0, i32 0>
ret <32 x i16> %shuffle
}
define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i16> %a) {
-; ALL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
-; ALL: # BB#0:
-; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; ALL-NEXT: retq
+; KNL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-NEXT: vmovdqa %ymm2, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 32, i32 0, i32 33, i32 0, i32 34, i32 0, i32 35, i32 0, i32 36, i32 0, i32 37, i32 0, i32 38, i32 0, i32 39, i32 0, i32 40, i32 0, i32 41, i32 0, i32 42, i32 0, i32 43, i32 0, i32 44, i32 0, i32 45, i32 0, i32 46, i32 0, i32 47, i32 0>
ret <32 x i16> %shuffle
}
+
+define <8 x i16> @pr32967(<32 x i16> %v) {
+; KNL-LABEL: pr32967:
+; KNL: ## BB#0:
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; KNL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; KNL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: pr32967:
+; SKX: ## BB#0:
+; SKX-NEXT: vpextrw $5, %xmm0, %eax
+; SKX-NEXT: vpextrw $1, %xmm0, %ecx
+; SKX-NEXT: vmovd %ecx, %xmm1
+; SKX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; SKX-NEXT: vpextrw $1, %xmm2, %eax
+; SKX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; SKX-NEXT: vpextrw $5, %xmm2, %eax
+; SKX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; SKX-NEXT: vpextrw $1, %xmm2, %eax
+; SKX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
+; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; SKX-NEXT: vpextrw $1, %xmm0, %eax
+; SKX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; SKX-NEXT: vpextrw $5, %xmm0, %eax
+; SKX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> <i32 1,i32 5,i32 9,i32 13,i32 17,i32 21,i32 25,i32 29>
+ ret <8 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-sqrt.ll b/test/CodeGen/X86/vector-sqrt.ll
index c5ac4466b5fa..8081e9482d67 100644
--- a/test/CodeGen/X86/vector-sqrt.ll
+++ b/test/CodeGen/X86/vector-sqrt.ll
@@ -29,11 +29,11 @@ define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1
-; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2
-; CHECK-NEXT: vsqrtss 12(%rdi), %xmm3, %xmm3
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm1
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; CHECK-NEXT: vsqrtss 12(%rdi), %xmm2, %xmm1
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; CHECK-NEXT: retq
entry:
%0 = load float, float* %v, align 4
diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll
index 34a9df1782a4..f5ec8e540b0b 100644
--- a/test/CodeGen/X86/viabs.ll
+++ b/test/CodeGen/X86/viabs.ll
@@ -405,16 +405,16 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
;
; AVX1-LABEL: test_abs_ge_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_ge_v2i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -447,21 +447,20 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_abs_gt_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_gt_v4i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -504,35 +503,31 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
; AVX1-LABEL: test_abs_le_v8i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_le_v8i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
@@ -581,37 +576,33 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_le_v8i64_fold:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vselect-pcmp.ll b/test/CodeGen/X86/vselect-pcmp.ll
index d33fda4f49c2..7807991b455d 100644
--- a/test/CodeGen/X86/vselect-pcmp.ll
+++ b/test/CodeGen/X86/vselect-pcmp.ll
@@ -35,9 +35,7 @@ define <8 x i16> @signbit_sel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask)
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpandn %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%tr = icmp slt <8 x i16> %mask, zeroinitializer
%z = select <8 x i1> %tr, <8 x i16> %x, <8 x i16> %y
@@ -162,18 +160,14 @@ define <16 x i16> @signbit_sel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: signbit_sel_v16i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX512-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vpandn %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
%tr = icmp slt <16 x i16> %mask, zeroinitializer
%z = select <16 x i1> %tr, <16 x i16> %x, <16 x i16> %y
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 6fbec91e77a3..450e255313b3 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -11,13 +11,13 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm1
-; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
%wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
%strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -39,11 +39,11 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
; AVX-NEXT: vmovupd 96(%rdi), %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0
; AVX-NEXT: retq
%wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
%strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -124,9 +124,9 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
%wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
index 7e370c25e31b..3052a0f615eb 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py for function "bar"
; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-;; In functions with 'no_caller_saved_registers' attribute, all registers should
+;; In functions with 'no_caller_saved_registers' attribute, all registers should
;; be preserved except for registers used for passing/returning arguments.
;; In the following function registers %RDI, %RSI and %XMM0 are used to store
;; arguments %a0, %a1 and %b0 accordingly. The value is returned in %RAX.
@@ -28,20 +28,20 @@ define x86_64_sysvcc i32 @bar(i32 %a0, i32 %a1, float %b0) #0 {
ret i32 4
}
-;; Because "bar" has 'no_caller_saved_registers' attribute, function "foo"
-;; doesn't need to preserve registers except for the arguments passed
+;; Because "bar" has 'no_caller_saved_registers' attribute, function "foo"
+;; doesn't need to preserve registers except for the arguments passed
;; to "bar" (%ESI, %EDI and %XMM0).
define x86_64_sysvcc float @foo(i32 %a0, i32 %a1, float %b0) {
-; CHECK-LABEL: foo
-; CHECK: movaps %xmm0, %xmm1
-; CHECK-NEXT: movl %esi, %ecx
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: callq bar
-; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: cvtsi2ssl %eax, %xmm0
-; CHECK-NEXT: addss %xmm0, %xmm1
+; CHECK-LABEL: foo
+; CHECK: movaps %xmm0, %xmm1
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: callq bar
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtsi2ssl %eax, %xmm0
+; CHECK-NEXT: addss %xmm0, %xmm1
; CHECK: retq
%call = call i32 @bar(i32 %a0, i32 %a1, float %b0) #0
%c0 = add i32 %a0, %call
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers.ll b/test/CodeGen/X86/x86-no_caller_saved_registers.ll
index 9c62e3ee6ba7..4e5403d1847f 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers.ll
@@ -1,31 +1,31 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s
-; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s
-; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 -O0 < %s | FileCheck %s
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; In functions with 'no_caller_saved_registers' attribute, all registers should
-;; be preserved except for registers used for passing/returning arguments.
-;; The test checks that function "bar" preserves xmm0 register.
-;; It also checks that caller function "foo" does not store registers for callee
-;; "bar". For example, there is no store/load/access to xmm registers.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define i32 @bar(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) #0 {
-; CHECK-LABEL: bar
-; CHECK: mov{{.*}} %xmm0
-; CHECK: mov{{.*}} {{.*}}, %xmm0
-; CHECK: ret
- call void asm sideeffect "", "~{xmm0}"()
- ret i32 1
-}
-
-define x86_intrcc void @foo(i8* nocapture readnone %c) {
-; CHECK-LABEL: foo
-; CHECK-NOT: xmm
-entry:
- tail call i32 @bar(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) #0
- ret void
-}
-
-attributes #0 = { "no_caller_saved_registers" }
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 -O0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; In functions with 'no_caller_saved_registers' attribute, all registers should
+;; be preserved except for registers used for passing/returning arguments.
+;; The test checks that function "bar" preserves xmm0 register.
+;; It also checks that caller function "foo" does not store registers for callee
+;; "bar". For example, there is no store/load/access to xmm registers.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define i32 @bar(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) #0 {
+; CHECK-LABEL: bar
+; CHECK: mov{{.*}} %xmm0
+; CHECK: mov{{.*}} {{.*}}, %xmm0
+; CHECK: ret
+ call void asm sideeffect "", "~{xmm0}"()
+ ret i32 1
+}
+
+define x86_intrcc void @foo(i8* nocapture readnone %c) {
+; CHECK-LABEL: foo
+; CHECK-NOT: xmm
+entry:
+ tail call i32 @bar(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) #0
+ ret void
+}
+
+attributes #0 = { "no_caller_saved_registers" }
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 5b6e773fe5d4..519f0d0924e3 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -270,8 +270,6 @@ if.end: ; preds = %if.else, %for.end
ret i32 %sum.1
}
-declare void @somethingElse(...)
-
; Check with a more complex case that we do not have restore within the loop and
; save outside.
; CHECK-LABEL: loopInfoRestoreOutsideLoop:
@@ -982,3 +980,54 @@ for.inc:
}
attributes #4 = { "no-frame-pointer-elim"="true" }
+
+@x = external global i32, align 4
+@y = external global i32, align 4
+
+; The post-dominator tree does not include the branch containing the infinite
+; loop, which can result in a misplacement of the restore block if we're
+; looking for the nearest common post-dominator of an "unreachable" block.
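+;
+; Concretely, in the IR below, block %9 calls @somethingElse and then
+; branches back to itself, so it has no path to the function's return block.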
+
+; CHECK-LABEL: infiniteLoopNoSuccessor:
+; CHECK: ## BB#0:
+; Make sure the prologue happens in the entry block.
+; CHECK-NEXT: pushq %rbp
+; ...
+; Make sure we don't shrink-wrap.
+; CHECK: ## BB#1
+; CHECK-NOT: pushq %rbp
+; ...
+; Make sure the epilogue happens in the exit block.
+; CHECK: ## BB#5
+; CHECK: popq %rbp
+; CHECK-NEXT: retq
+define void @infiniteLoopNoSuccessor() #5 {
+ %1 = load i32, i32* @x, align 4
+ %2 = icmp ne i32 %1, 0
+ br i1 %2, label %3, label %4
+
+; <label>:3:
+ store i32 0, i32* @x, align 4
+ br label %4
+
+; <label>:4:
+ call void (...) @somethingElse()
+ %5 = load i32, i32* @y, align 4
+ %6 = icmp ne i32 %5, 0
+ br i1 %6, label %10, label %7
+
+; <label>:7:
+ %8 = call i32 (...) @something()
+ br label %9
+
+; <label>:9:
+ call void (...) @somethingElse()
+ br label %9
+
+; <label>:10:
+ ret void
+}
+
+declare void @somethingElse(...)
+
+attributes #5 = { nounwind "no-frame-pointer-elim-non-leaf" }
diff --git a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
index a100a1425dd1..5f56e2d80d73 100644
--- a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
@@ -499,8 +499,8 @@ declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_cmov_si256:
; X32: # BB#0:
-; X32-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; X32-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; X32-NEXT: vxorps %ymm3, %ymm3, %ymm3
+; X32-NEXT: vcmptrueps %ymm3, %ymm3, %ymm3
; X32-NEXT: vxorps %ymm3, %ymm2, %ymm3
; X32-NEXT: vandps %ymm2, %ymm0, %ymm0
; X32-NEXT: vandps %ymm3, %ymm1, %ymm1
@@ -509,8 +509,8 @@ define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>
;
; X64-LABEL: test_mm256_cmov_si256:
; X64: # BB#0:
-; X64-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; X64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; X64-NEXT: vxorps %ymm3, %ymm3, %ymm3
+; X64-NEXT: vcmptrueps %ymm3, %ymm3, %ymm3
; X64-NEXT: vxorps %ymm3, %ymm2, %ymm3
; X64-NEXT: vandps %ymm2, %ymm0, %ymm0
; X64-NEXT: vandps %ymm3, %ymm1, %ymm1
diff --git a/test/DebugInfo/COFF/local-variables.ll b/test/DebugInfo/COFF/local-variables.ll
index d1ad8767d413..c0bac0d174a9 100644
--- a/test/DebugInfo/COFF/local-variables.ll
+++ b/test/DebugInfo/COFF/local-variables.ll
@@ -28,7 +28,6 @@
; ASM: .seh_proc f
; ASM: # BB#0: # %entry
; ASM: subq $56, %rsp
-; ASM: #DEBUG_VALUE: f:param <- [%RSP+52]
; ASM: movl %ecx, 52(%rsp)
; ASM: [[prologue_end:\.Ltmp.*]]:
; ASM: .cv_loc 0 1 8 7 # t.cpp:8:7
@@ -36,8 +35,6 @@
; ASM: je .LBB0_2
; ASM: [[if_start:\.Ltmp.*]]:
; ASM: # BB#1: # %if.then
-; ASM: #DEBUG_VALUE: f:param <- [%RSP+52]
-; ASM: #DEBUG_VALUE: a <- [%RSP+40]
; ASM: .cv_loc 0 1 9 9 # t.cpp:9:9
; ASM: movl $42, 40(%rsp)
; ASM: [[inline_site1:\.Ltmp.*]]:
@@ -51,8 +48,6 @@
; ASM: jmp .LBB0_3
; ASM: [[else_start:\.Ltmp.*]]:
; ASM: .LBB0_2: # %if.else
-; ASM: #DEBUG_VALUE: f:param <- [%RSP+52]
-; ASM: #DEBUG_VALUE: b <- [%RSP+36]
; ASM: .cv_loc 0 1 13 9 # t.cpp:13:9
; ASM: movl $42, 36(%rsp)
; ASM: [[inline_site2:\.Ltmp.*]]:
diff --git a/test/DebugInfo/COFF/no-cus.ll b/test/DebugInfo/COFF/no-cus.ll
new file mode 100644
index 000000000000..349fe680de66
--- /dev/null
+++ b/test/DebugInfo/COFF/no-cus.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -filetype=obj -o %t.o
+; RUN: llvm-objdump -section-headers %t.o | FileCheck %s
+
+; This module has the CodeView module flag set but contains no compile units
+; (no !llvm.dbg.cu). Don't emit debug info in this scenario and don't crash.
+
+; CHECK-NOT: .debug$S
+; CHECK: .text
+; CHECK-NOT: .debug$S
+
+; ModuleID = 't.cpp'
+source_filename = "t.cpp"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.10.24728"
+
+define void @f() {
+entry:
+ ret void
+}
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 2, !"CodeView", i32 1}
+!1 = !{i32 1, !"PIC Level", i32 2}
+!2 = !{!"clang version 5.0.0 "}
diff --git a/test/DebugInfo/Inputs/typeunit-header.elf-x86-64 b/test/DebugInfo/Inputs/typeunit-header.elf-x86-64
new file mode 100644
index 000000000000..26fb0a5177d0
--- /dev/null
+++ b/test/DebugInfo/Inputs/typeunit-header.elf-x86-64
Binary files differ
diff --git a/test/DebugInfo/Inputs/typeunit-header.s b/test/DebugInfo/Inputs/typeunit-header.s
new file mode 100644
index 000000000000..802eb01c552c
--- /dev/null
+++ b/test/DebugInfo/Inputs/typeunit-header.s
@@ -0,0 +1,49 @@
+# Test object with an artificially constructed type unit header, used to
+# verify that the length field is correctly taken into account when
+# validating the type_offset field.
+#
+# To generate the test object:
+# llvm-mc -triple x86_64-unknown-linux typeunit-header.s -filetype=obj \
+# -o typeunit-header.elf-x86-64
+#
+# We only have an abbreviation for the type unit DIE, which is all we need.
+# Real type unit DIEs have quite different attributes, of course, but we
+# only need to demonstrate an issue with validating the length, so we give it
+# a single visibility attribute.
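+#
+# A sketch of the layout arithmetic (standard DWARF v4 DWARF32 encoding): the
+# header below is 23 (0x17) bytes -- 4-byte unit length, 2-byte version,
+# 4-byte abbrev offset, 1-byte address size, 8-byte type signature, 4-byte
+# type offset -- and the two-byte type unit DIE follows, so TU_4_32_type ends
+# up at offset 0x19 from TU_4_32_start, while the length field itself is
+# excluded from the unit length.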
+ .section .debug_abbrev,"",@progbits
+ .byte 0x01 # Abbrev code
+ .byte 0x41 # DW_TAG_type_unit
+ .byte 0x01 # DW_CHILDREN_yes
+ .byte 0x17 # DW_AT_visibility
+ .byte 0x0b # DW_FORM_data1
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+ .byte 0x02 # Abbrev code
+ .byte 0x13 # DW_TAG_structure_type
+ .byte 0x00 # DW_CHILDREN_no (no members)
+ .byte 0x17 # DW_AT_visibility
+ .byte 0x0b # DW_FORM_data1
+ .byte 0x00 # EOM(1)
+ .byte 0x00 # EOM(2)
+ .byte 0x00 # EOM(3)
+
+ .section .debug_types,"",@progbits
+# DWARF v4 Type unit header - DWARF32 format.
+TU_4_32_start:
+ .long TU_4_32_end-TU_4_32_version # Length of Unit
+TU_4_32_version:
+ .short 4 # DWARF version number
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 8 # Address Size (in bytes)
+ .quad 0x0011223344556677 # Type Signature
+ .long TU_4_32_type-TU_4_32_start # Type offset
+# The type-unit DIE, which has just a visibility attribute.
+ .byte 1 # Abbreviation code
+ .byte 1 # DW_VIS_local
+# The type DIE, which also just has a one-byte visibility attribute.
+TU_4_32_type:
+ .byte 2 # Abbreviation code
+ .byte 1 # DW_VIS_local
+ .byte 0 # NULL
+ .byte 0 # NULL
+TU_4_32_end:
diff --git a/test/DebugInfo/PDB/DIA/pdbdump-symbol-format.test b/test/DebugInfo/PDB/DIA/pdbdump-symbol-format.test
index 0bb3e001d3a4..997cdd9f6bac 100644
--- a/test/DebugInfo/PDB/DIA/pdbdump-symbol-format.test
+++ b/test/DebugInfo/PDB/DIA/pdbdump-symbol-format.test
@@ -1,5 +1,5 @@
-; RUN: llvm-pdbdump pretty -symbols %p/../Inputs/symbolformat.pdb | FileCheck --check-prefix=SYM_FORMAT_FPO %s
-; RUN: llvm-pdbdump pretty -symbols %p/../Inputs/symbolformat.pdb | FileCheck --check-prefix=SYM_FORMAT %s
+; RUN: llvm-pdbdump pretty -module-syms %p/../Inputs/symbolformat.pdb | FileCheck --check-prefix=SYM_FORMAT_FPO %s
+; RUN: llvm-pdbdump pretty -module-syms %p/../Inputs/symbolformat.pdb | FileCheck --check-prefix=SYM_FORMAT %s
; RUN: llvm-pdbdump pretty -types %p/../Inputs/symbolformat.pdb > %t.types
; RUN: FileCheck --check-prefix=TYPES_FORMAT %s < %t.types
; RUN: FileCheck --check-prefix=TYPES_1 %s < %t.types
diff --git a/test/DebugInfo/X86/dbg-declare-inalloca.ll b/test/DebugInfo/X86/dbg-declare-inalloca.ll
new file mode 100644
index 000000000000..e3f5c7e629b8
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-declare-inalloca.ll
@@ -0,0 +1,199 @@
+; RUN: llc -O0 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DEBUG
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -filetype=obj -O0 < %s | llvm-readobj -codeview - | FileCheck %s --check-prefix=OBJ
+
+; IR generated by the following source:
+; struct NonTrivial {
+; NonTrivial();// : x(42) {}
+; ~NonTrivial();// {}
+; int x;
+; };
+; extern "C" void g(int);// {}
+; extern "C" void h(int);// {}
+; extern "C" void f(NonTrivial a, int b, int unused, int c) {
+; if (b) {
+; g(c);
+; } else {
+; h(a.x);
+; }
+; (void)unused;
+; }
+; //int main() {
+; // NonTrivial x;
+; // f(x, 1, 2, 3);
+; //}
+;
+; Remove C++ comments to have a complete, debuggable program.
+
+; We don't need (or want) DBG_VALUE instructions to describe the location of
+; inalloca arguments. We want frame indices in the side table, especially at
+; -O0, because they are reliable across the entire function and don't require
+; any propagation or analysis.
+
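+; In the CodeView output this shows up as S_LOCAL records for a, b, and c with
+; .cv_def_range spans covering the whole function, backed in the object file by
+; DefRangeRegisterRel (frame-relative) records, and no DEBUG_VALUE comments in
+; the assembly.
+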
+; CHECK: _f: # @f
+; CHECK: Lfunc_begin0:
+; CHECK-NOT: DEBUG_VALUE
+; CHECK: [[start:Ltmp[0-9]+]]:
+; CHECK-NOT: DEBUG_VALUE
+; CHECK: cmpl
+; CHECK: calll _g
+; CHECK: calll _h
+; CHECK: jmp "??1NonTrivial@@QAE@XZ"
+; CHECK: [[end:Ltmp[0-9]+]]:
+; CHECK: Lfunc_end0:
+
+; FIXME: Optimized debug info should preserve this.
+; DEBUG: .short 4414 # Record kind: S_LOCAL
+; DEBUG: .asciz "a"
+; DEBUG: .cv_def_range [[start]] [[end]]
+
+; CHECK: .short 4414 # Record kind: S_LOCAL
+; CHECK: .asciz "b"
+; CHECK: .cv_def_range [[start]] [[end]]
+
+; CHECK: .short 4414 # Record kind: S_LOCAL
+; CHECK: .asciz "c"
+; CHECK: .cv_def_range [[start]] [[end]]
+
+; OBJ-LABEL: ProcStart {
+; OBJ: Kind: S_GPROC32_ID (0x1147)
+; OBJ: DisplayName: f
+; OBJ: }
+; OBJ: Local {
+; OBJ: Type: NonTrivial (0x1007)
+; OBJ: Flags [ (0x1)
+; OBJ: IsParameter (0x1)
+; OBJ: ]
+; OBJ: VarName: a
+; OBJ: }
+; OBJ: DefRangeRegisterRel {
+; OBJ: BaseRegister: 21
+; OBJ: BasePointerOffset: 12
+; OBJ: }
+; OBJ: Local {
+; OBJ: Type: int (0x74)
+; OBJ: Flags [ (0x1)
+; OBJ: IsParameter (0x1)
+; OBJ: ]
+; OBJ: VarName: b
+; OBJ: }
+; OBJ: DefRangeRegisterRel {
+; OBJ: BaseRegister: 21
+; OBJ: BasePointerOffset: 16
+; OBJ: }
+; FIXME: Retain unused.
+; OBJ: Local {
+; OBJ: Type: int (0x74)
+; OBJ: Flags [ (0x1)
+; OBJ: IsParameter (0x1)
+; OBJ: ]
+; OBJ: VarName: c
+; OBJ: }
+; OBJ: DefRangeRegisterRel {
+; OBJ: BaseRegister: 21
+; OBJ: BasePointerOffset: 24
+; OBJ: }
+; OBJ-LABEL: ProcEnd {
+; OBJ: }
+
+
+; ModuleID = 't.cpp'
+source_filename = "t.cpp"
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-windows-msvc19.10.24728"
+
+%struct.NonTrivial = type { i32 }
+
+; Function Attrs: nounwind
+define void @f(<{ %struct.NonTrivial, i32, i32, i32 }>* inalloca) local_unnamed_addr #0 !dbg !7 {
+entry:
+ %a = getelementptr inbounds <{ %struct.NonTrivial, i32, i32, i32 }>, <{ %struct.NonTrivial, i32, i32, i32 }>* %0, i32 0, i32 0
+ %b = getelementptr inbounds <{ %struct.NonTrivial, i32, i32, i32 }>, <{ %struct.NonTrivial, i32, i32, i32 }>* %0, i32 0, i32 1
+ tail call void @llvm.dbg.declare(metadata i32* %c, metadata !20, metadata !24), !dbg !25
+ tail call void @llvm.dbg.declare(metadata i32* %b, metadata !22, metadata !24), !dbg !26
+ tail call void @llvm.dbg.declare(metadata %struct.NonTrivial* %a, metadata !23, metadata !24), !dbg !27
+ %1 = load i32, i32* %b, align 4, !dbg !28, !tbaa !30
+ %tobool = icmp eq i32 %1, 0, !dbg !28
+ br i1 %tobool, label %if.else, label %if.then, !dbg !34
+
+if.then: ; preds = %entry
+ %c = getelementptr inbounds <{ %struct.NonTrivial, i32, i32, i32 }>, <{ %struct.NonTrivial, i32, i32, i32 }>* %0, i32 0, i32 3
+ %2 = load i32, i32* %c, align 4, !dbg !35, !tbaa !30
+ tail call void @g(i32 %2) #4, !dbg !37
+ br label %if.end, !dbg !38
+
+if.else: ; preds = %entry
+ %x = getelementptr inbounds <{ %struct.NonTrivial, i32, i32, i32 }>, <{ %struct.NonTrivial, i32, i32, i32 }>* %0, i32 0, i32 0, i32 0, !dbg !39
+ %3 = load i32, i32* %x, align 4, !dbg !39, !tbaa !41
+ tail call void @h(i32 %3) #4, !dbg !43
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ tail call x86_thiscallcc void @"\01??1NonTrivial@@QAE@XZ"(%struct.NonTrivial* nonnull %a) #4, !dbg !44
+ ret void, !dbg !44
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare void @g(i32) local_unnamed_addr
+
+declare void @h(i32) local_unnamed_addr
+
+; Function Attrs: nounwind
+declare x86_thiscallcc void @"\01??1NonTrivial@@QAE@XZ"(%struct.NonTrivial*) unnamed_addr #3
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "t.cpp", directory: "C:\5Csrc\5Cllvm-project\5Cbuild", checksumkind: CSK_MD5, checksum: "e41e3fda2a91b52e121ed6c29a209eae")
+!2 = !{}
+!3 = !{i32 1, !"NumRegisterParameters", i32 0}
+!4 = !{i32 2, !"CodeView", i32 1}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{!"clang version 5.0.0 "}
+!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !19)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10, !13, !13, !13}
+!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "NonTrivial", file: !1, line: 1, size: 32, elements: !11, identifier: ".?AUNonTrivial@@")
+!11 = !{!12, !14, !18}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !10, file: !1, line: 4, baseType: !13, size: 32)
+!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!14 = !DISubprogram(name: "NonTrivial", scope: !10, file: !1, line: 2, type: !15, isLocal: false, isDefinition: false, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true)
+!15 = !DISubroutineType(cc: DW_CC_BORLAND_thiscall, types: !16)
+!16 = !{null, !17}
+!17 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, flags: DIFlagArtificial | DIFlagObjectPointer)
+!18 = !DISubprogram(name: "~NonTrivial", scope: !10, file: !1, line: 3, type: !15, isLocal: false, isDefinition: false, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true)
+!19 = !{!20, !21, !22, !23}
+!20 = !DILocalVariable(name: "c", arg: 4, scope: !7, file: !1, line: 8, type: !13)
+!21 = !DILocalVariable(name: "unused", arg: 3, scope: !7, file: !1, line: 8, type: !13)
+!22 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 8, type: !13)
+!23 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 8, type: !10)
+!24 = !DIExpression()
+!25 = !DILocation(line: 8, column: 56, scope: !7)
+!26 = !DILocation(line: 8, column: 37, scope: !7)
+!27 = !DILocation(line: 8, column: 30, scope: !7)
+!28 = !DILocation(line: 9, column: 7, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !7, file: !1, line: 9, column: 7)
+!30 = !{!31, !31, i64 0}
+!31 = !{!"int", !32, i64 0}
+!32 = !{!"omnipotent char", !33, i64 0}
+!33 = !{!"Simple C++ TBAA"}
+!34 = !DILocation(line: 9, column: 7, scope: !7)
+!35 = !DILocation(line: 10, column: 7, scope: !36)
+!36 = distinct !DILexicalBlock(scope: !29, file: !1, line: 9, column: 10)
+!37 = !DILocation(line: 10, column: 5, scope: !36)
+!38 = !DILocation(line: 11, column: 3, scope: !36)
+!39 = !DILocation(line: 12, column: 9, scope: !40)
+!40 = distinct !DILexicalBlock(scope: !29, file: !1, line: 11, column: 10)
+!41 = !{!42, !31, i64 0}
+!42 = !{!"?AUNonTrivial@@", !31, i64 0}
+!43 = !DILocation(line: 12, column: 5, scope: !40)
+!44 = !DILocation(line: 15, column: 1, scope: !7)
diff --git a/test/DebugInfo/X86/split-dwarf-cross-unit-reference.ll b/test/DebugInfo/X86/split-dwarf-cross-unit-reference.ll
index c6f0afa27937..ca8525cd335b 100644
--- a/test/DebugInfo/X86/split-dwarf-cross-unit-reference.ll
+++ b/test/DebugInfo/X86/split-dwarf-cross-unit-reference.ll
@@ -1,46 +1,194 @@
-; RUN: llc -mtriple=x86_64-linux -split-dwarf-file=foo.dwo -filetype=obj -o - < %s | llvm-objdump -r - | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux -split-dwarf-cross-cu-references -split-dwarf-file=foo.dwo -filetype=obj -o %t < %s
+; RUN: llvm-objdump -r %t | FileCheck %s
+; RUN: llvm-dwarfdump -debug-dump=info.dwo %t | FileCheck --check-prefix=ALL --check-prefix=INFO --check-prefix=DWO --check-prefix=CROSS %s
+; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck --check-prefix=ALL --check-prefix=INFO %s
+
+; RUN: llc -mtriple=x86_64-linux -split-dwarf-file=foo.dwo -filetype=obj -o %t < %s
+; RUN: llvm-objdump -r %t | FileCheck %s
+; RUN: llvm-dwarfdump -debug-dump=info.dwo %t | FileCheck --check-prefix=ALL --check-prefix=DWO --check-prefix=NOCROSS %s
+; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck --check-prefix=ALL --check-prefix=INFO %s
+
+; Testing cross-CU references for types, subprograms, and variables
+; Built from code something like this:
+; foo.cpp:
+; struct t1 { int i; };
+; void f();
+; __attribute__((always_inline)) void f1(t1 t) {
+; f();
+; }
+; void foo(t1 t) {
+; f1(t);
+; }
+; bar.cpp:
+; struct t1 { int i; };
+; void f1(t1);
+; void bar(t1 t) {
+; f1(t);
+; }
+; $ clang++-tot -emit-llvm -S {foo,bar}.cpp -g
+; $ llvm-link-tot {foo,bar}.ll -S -o foobar.ll
+; $ clang++-tot -emit-llvm foobar.ll -o foobar.opt.ll -S -c
+;
+; Then manually removing the original f1 definition, to simplify the DWARF a bit
+; (so it only has the inlined definitions, no concrete definition)
+
+; Check that:
+; * no relocations are emitted for the debug_info.dwo section no matter what
+; * one debug_info->debug_info relocation in debug_info no matter what (for
+; split dwarf inlining)
+; * debug_info uses relocations and ref_addr no matter what
+; * debug_info.dwo uses relocations for types as well as abstract subprograms
+; and variables when -split-dwarf-cross-cu-references is used
+; * debug_info.dwo contains duplicate types, abstract subprograms and abstract
+; variables otherwise to avoid the need for cross-cu references
; CHECK-NOT: .rel{{a?}}.debug_info.dwo
; CHECK: RELOCATION RECORDS FOR [.rel{{a?}}.debug_info]:
; CHECK-NOT: RELOCATION RECORDS
-; Expect one relocation in debug_info, between f3 and f1.
+; Expect one relocation in debug_info, from the inlined f1 in foo to its
+; abstract origin in bar
; CHECK: R_X86_64_32 .debug_info
+; CHECK-NOT: RELOCATION RECORDS
; CHECK-NOT: .debug_info
; CHECK: RELOCATION RECORDS
; CHECK-NOT: .rel{{a?}}.debug_info.dwo
+; ALL: Compile Unit
+; ALL: DW_TAG_compile_unit
+; DWO: DW_AT_name {{.*}} "foo.cpp"
+; ALL: 0x[[F1:.*]]: DW_TAG_subprogram
+; ALL: DW_AT_name {{.*}} "f1"
+; DWO: 0x[[F1T:.*]]: DW_TAG_formal_parameter
+; DWO: DW_AT_name {{.*}} "t"
+; DWO: DW_AT_type [DW_FORM_ref4] {{.*}}{0x[[T1:.*]]}
+; DWO: NULL
+; DWO: 0x[[T1]]: DW_TAG_structure_type
+; DWO: DW_AT_name {{.*}} "t1"
+; ALL: DW_TAG_subprogram
+; ALL: DW_AT_name {{.*}} "foo"
+; DWO: DW_TAG_formal_parameter
+; DWO: DW_AT_name {{.*}} "t"
+; DWO: DW_AT_type [DW_FORM_ref4] {{.*}}{0x[[T1]]}
+; ALL: DW_TAG_inlined_subroutine
+; ALL: DW_AT_abstract_origin [DW_FORM_ref4] {{.*}}{0x[[F1]]}
+; DWO: DW_TAG_formal_parameter
+; DWO: DW_AT_abstract_origin [DW_FORM_ref4] {{.*}}{0x[[F1T]]}
+
+; ALL: Compile Unit
+; ALL: DW_TAG_compile_unit
+; DWO: DW_AT_name {{.*}} "bar.cpp"
+; NOCROSS: 0x[[BAR_F1:.*]]: DW_TAG_subprogram
+; NOCROSS: DW_AT_name {{.*}} "f1"
+; NOCROSS: 0x[[BAR_F1T:.*]]: DW_TAG_formal_parameter
+; NOCROSS: DW_AT_name {{.*}} "t"
+; NOCROSS: DW_AT_type [DW_FORM_ref4] {{.*}}{0x[[BAR_T1:.*]]}
+; NOCROSS: NULL
+; NOCROSS: 0x[[BAR_T1]]: DW_TAG_structure_type
+; NOCROSS: DW_AT_name {{.*}} "t1"
+; ALL: DW_TAG_subprogram
+; ALL: DW_AT_name {{.*}} "bar"
+; DWO: DW_TAG_formal_parameter
+; DWO: DW_AT_name {{.*}} "t"
+; CROSS: DW_AT_type [DW_FORM_ref_addr] (0x00000000[[T1]]
+; NOCROSS: DW_AT_type [DW_FORM_ref4] {{.*}}{0x[[BAR_T1]]}
+; ALL: DW_TAG_inlined_subroutine
+; INFO: DW_AT_abstract_origin [DW_FORM_ref_addr] (0x00000000[[F1]]
+; NOCROSS: DW_AT_abstract_origin [DW_FORM_ref4] {{.*}}{0x[[BAR_F1]]}
+; DWO: DW_TAG_formal_parameter
+; CROSS: DW_AT_abstract_origin [DW_FORM_ref_addr] (0x00000000[[F1T]]
+; NOCROSS: DW_AT_abstract_origin [DW_FORM_ref4] {{.*}}{0x[[BAR_F1T]]
-; Function Attrs: noinline nounwind optnone uwtable
-define void @_Z2f1v() !dbg !7 {
+%struct.t1 = type { i32 }
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare void @_Z1fv() #2
+
+; Function Attrs: noinline uwtable
+define void @_Z3foo2t1(i32 %t.coerce) #3 !dbg !20 {
entry:
- ret void, !dbg !10
+ %t.i = alloca %struct.t1, align 4
+ call void @llvm.dbg.declare(metadata %struct.t1* %t.i, metadata !15, metadata !16), !dbg !21
+ %t = alloca %struct.t1, align 4
+ %agg.tmp = alloca %struct.t1, align 4
+ %coerce.dive = getelementptr inbounds %struct.t1, %struct.t1* %t, i32 0, i32 0
+ store i32 %t.coerce, i32* %coerce.dive, align 4
+ call void @llvm.dbg.declare(metadata %struct.t1* %t, metadata !23, metadata !16), !dbg !24
+ %0 = bitcast %struct.t1* %agg.tmp to i8*, !dbg !25
+ %1 = bitcast %struct.t1* %t to i8*, !dbg !25
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 4, i32 4, i1 false), !dbg !25
+ %coerce.dive1 = getelementptr inbounds %struct.t1, %struct.t1* %agg.tmp, i32 0, i32 0, !dbg !26
+ %2 = load i32, i32* %coerce.dive1, align 4, !dbg !26
+ %coerce.dive.i = getelementptr inbounds %struct.t1, %struct.t1* %t.i, i32 0, i32 0
+ store i32 %2, i32* %coerce.dive.i, align 4
+ call void @_Z1fv(), !dbg !27
+ ret void, !dbg !28
}
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #4
+
; Function Attrs: noinline uwtable
-define void @_Z2f3v() !dbg !13 {
+define void @_Z3bar2t1(i32 %t.coerce) #3 !dbg !29 {
entry:
- call void @_Z2f1v(), !dbg !14
- ret void, !dbg !16
+ %t.i = alloca %struct.t1, align 4
+ call void @llvm.dbg.declare(metadata %struct.t1* %t.i, metadata !15, metadata !16), !dbg !30
+ %t = alloca %struct.t1, align 4
+ %agg.tmp = alloca %struct.t1, align 4
+ %coerce.dive = getelementptr inbounds %struct.t1, %struct.t1* %t, i32 0, i32 0
+ store i32 %t.coerce, i32* %coerce.dive, align 4
+ call void @llvm.dbg.declare(metadata %struct.t1* %t, metadata !32, metadata !16), !dbg !33
+ %0 = bitcast %struct.t1* %agg.tmp to i8*, !dbg !34
+ %1 = bitcast %struct.t1* %t to i8*, !dbg !34
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 4, i32 4, i1 false), !dbg !34
+ %coerce.dive1 = getelementptr inbounds %struct.t1, %struct.t1* %agg.tmp, i32 0, i32 0, !dbg !35
+ %2 = load i32, i32* %coerce.dive1, align 4, !dbg !35
+ %coerce.dive.i = getelementptr inbounds %struct.t1, %struct.t1* %t.i, i32 0, i32 0
+ store i32 %2, i32* %coerce.dive.i, align 4
+ call void @_Z1fv(), !dbg !36
+ ret void, !dbg !37
}
!llvm.dbg.cu = !{!0, !3}
!llvm.ident = !{!5, !5}
-!llvm.module.flags = !{!6}
+!llvm.module.flags = !{!6, !7}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 301051) (llvm/trunk 301062)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "a.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 302809) (llvm/trunk 302815)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: true)
+!1 = !DIFile(filename: "foo.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
!2 = !{}
-!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !4, producer: "clang version 5.0.0 (trunk 301051) (llvm/trunk 301062)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!4 = !DIFile(filename: "b.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
-!5 = !{!"clang version 5.0.0 (trunk 301051) (llvm/trunk 301062)"}
-!6 = !{i32 2, !"Debug Info Version", i32 3}
-!7 = distinct !DISubprogram(name: "f1", linkageName: "_Z2f1v", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
-!8 = !DISubroutineType(types: !9)
-!9 = !{null}
-!10 = !DILocation(line: 1, scope: !7)
-!11 = distinct !DISubprogram(name: "f2", linkageName: "_Z2f2v", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
-!12 = !DILocation(line: 1, scope: !11)
-!13 = distinct !DISubprogram(name: "f3", linkageName: "_Z2f3v", scope: !4, file: !4, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !3, variables: !2)
-!14 = !DILocation(line: 1, scope: !11, inlinedAt: !15)
-!15 = distinct !DILocation(line: 1, scope: !13)
-!16 = !DILocation(line: 1, scope: !13)
+!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !4, producer: "clang version 5.0.0 (trunk 302809) (llvm/trunk 302815)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: true)
+!4 = !DIFile(filename: "bar.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!5 = !{!"clang version 5.0.0 (trunk 302809) (llvm/trunk 302815)"}
+!6 = !{i32 2, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = distinct !DISubprogram(name: "f1", linkageName: "_Z2f12t1", scope: !1, file: !1, line: 3, type: !9, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!9 = !DISubroutineType(types: !10)
+!10 = !{null, !11}
+!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1", file: !1, line: 1, size: 32, elements: !12, identifier: "_ZTS2t1")
+!12 = !{!13}
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "i", scope: !11, file: !1, line: 1, baseType: !14, size: 32)
+!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!15 = !DILocalVariable(name: "t", arg: 1, scope: !8, file: !1, line: 3, type: !11)
+!16 = !DIExpression()
+!17 = !DILocation(line: 3, column: 43, scope: !8)
+!18 = !DILocation(line: 4, column: 3, scope: !8)
+!19 = !DILocation(line: 5, column: 1, scope: !8)
+!20 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foo2t1", scope: !1, file: !1, line: 6, type: !9, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!21 = !DILocation(line: 3, column: 43, scope: !8, inlinedAt: !22)
+!22 = distinct !DILocation(line: 7, column: 3, scope: !20)
+!23 = !DILocalVariable(name: "t", arg: 1, scope: !20, file: !1, line: 6, type: !11)
+!24 = !DILocation(line: 6, column: 13, scope: !20)
+!25 = !DILocation(line: 7, column: 6, scope: !20)
+!26 = !DILocation(line: 7, column: 3, scope: !20)
+!27 = !DILocation(line: 4, column: 3, scope: !8, inlinedAt: !22)
+!28 = !DILocation(line: 8, column: 1, scope: !20)
+!29 = distinct !DISubprogram(name: "bar", linkageName: "_Z3bar2t1", scope: !4, file: !4, line: 3, type: !9, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !3, variables: !2)
+!30 = !DILocation(line: 3, column: 43, scope: !8, inlinedAt: !31)
+!31 = distinct !DILocation(line: 4, column: 3, scope: !29)
+!32 = !DILocalVariable(name: "t", arg: 1, scope: !29, file: !4, line: 3, type: !11)
+!33 = !DILocation(line: 3, column: 13, scope: !29)
+!34 = !DILocation(line: 4, column: 6, scope: !29)
+!35 = !DILocation(line: 4, column: 3, scope: !29)
+!36 = !DILocation(line: 4, column: 3, scope: !8, inlinedAt: !31)
+!37 = !DILocation(line: 5, column: 1, scope: !29)
diff --git a/test/DebugInfo/typeunit-header.test b/test/DebugInfo/typeunit-header.test
new file mode 100644
index 000000000000..c16156b91e6f
--- /dev/null
+++ b/test/DebugInfo/typeunit-header.test
@@ -0,0 +1,15 @@
+RUN: llvm-dwarfdump %p/Inputs/typeunit-header.elf-x86-64 | FileCheck %s
+
+This tests a bugfix where parsing the type unit header did not take the
+unit's initial length field into account when validating.
+
+The input file is hand-coded assembler generating a type unit stub that
+contains only a type unit DIE with a sole visibility attribute.
+
+We make sure that llvm-dwarfdump parses the type unit header correctly
+and displays it.
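+
+A sketch of the arithmetic (values from the CHECK line below): the unit
+length is 0x19, so the unit spans 0x19 + 4 = 0x1d bytes once the 4-byte
+initial length field is included, and type_offset = 0x19 is in range only
+when those 4 bytes are counted; validating it against the bare length
+would wrongly reject it.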
+
+CHECK: .debug_types contents:
+CHECK: 0x00000000: Type Unit: length = 0x00000019 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = '' type_signature = 0x0011223344556677 type_offset = 0x0019 (next unit at 0x0000001d)
+CHECK: 0x00000017: DW_TAG_type_unit [1] *
+CHECK: DW_AT_visibility [DW_FORM_data1] (DW_VIS_local)
diff --git a/test/Feature/intrinsic-noduplicate.ll b/test/Feature/intrinsic-noduplicate.ll
index 4f2ae1c698c9..f7b377aae38b 100644
--- a/test/Feature/intrinsic-noduplicate.ll
+++ b/test/Feature/intrinsic-noduplicate.ll
@@ -1,4 +1,5 @@
; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; REQUIRES: NVPTX
; Make sure LLVM knows about the convergent attribute on the
; llvm.nvvm.barrier0 intrinsic.
diff --git a/test/Instrumentation/MemorySanitizer/msan_basic.ll b/test/Instrumentation/MemorySanitizer/msan_basic.ll
index 4b208d64427b..334e00dabf40 100644
--- a/test/Instrumentation/MemorySanitizer/msan_basic.ll
+++ b/test/Instrumentation/MemorySanitizer/msan_basic.ll
@@ -617,70 +617,6 @@ declare i32 @llvm.bswap.i32(i32) nounwind readnone
; CHECK-NOT: call void @__msan_warning
; CHECK: ret i32
-
-; Store intrinsic.
-
-define void @StoreIntrinsic(i8* %p, <4 x float> %x) nounwind uwtable sanitize_memory {
- call void @llvm.x86.sse.storeu.ps(i8* %p, <4 x float> %x)
- ret void
-}
-
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
-; CHECK-LABEL: @StoreIntrinsic
-; CHECK-NOT: br
-; CHECK-NOT: = or
-; CHECK: store <4 x i32> {{.*}} align 1
-; CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
-; CHECK: ret void
-
-
-; Load intrinsic.
-
-define <16 x i8> @LoadIntrinsic(i8* %p) nounwind uwtable sanitize_memory {
- %call = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %p)
- ret <16 x i8> %call
-}
-
-declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %p) nounwind
-
-; CHECK-LABEL: @LoadIntrinsic
-; CHECK: load <16 x i8>, <16 x i8>* {{.*}} align 1
-; CHECK-ORIGINS: [[ORIGIN:%[01-9a-z]+]] = load i32, i32* {{.*}}
-; CHECK-NOT: br
-; CHECK-NOT: = or
-; CHECK: call <16 x i8> @llvm.x86.sse3.ldu.dq
-; CHECK: store <16 x i8> {{.*}} @__msan_retval_tls
-; CHECK-ORIGINS: store i32 {{.*}}[[ORIGIN]], i32* @__msan_retval_origin_tls
-; CHECK: ret <16 x i8>
-
-
-; Simple NoMem intrinsic
-; Check that shadow is OR'ed, and origin is Select'ed
-; And no shadow checks!
-
-define <8 x i16> @Paddsw128(<8 x i16> %a, <8 x i16> %b) nounwind uwtable sanitize_memory {
- %call = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b)
- ret <8 x i16> %call
-}
-
-declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) nounwind
-
-; CHECK-LABEL: @Paddsw128
-; CHECK-NEXT: load <8 x i16>, <8 x i16>* {{.*}} @__msan_param_tls
-; CHECK-ORIGINS: load i32, i32* {{.*}} @__msan_param_origin_tls
-; CHECK-NEXT: load <8 x i16>, <8 x i16>* {{.*}} @__msan_param_tls
-; CHECK-ORIGINS: load i32, i32* {{.*}} @__msan_param_origin_tls
-; CHECK-NEXT: = or <8 x i16>
-; CHECK-ORIGINS: = bitcast <8 x i16> {{.*}} to i128
-; CHECK-ORIGINS-NEXT: = icmp ne i128 {{.*}}, 0
-; CHECK-ORIGINS-NEXT: = select i1 {{.*}}, i32 {{.*}}, i32
-; CHECK-NEXT: call <8 x i16> @llvm.x86.sse2.padds.w
-; CHECK-NEXT: store <8 x i16> {{.*}} @__msan_retval_tls
-; CHECK-ORIGINS: store i32 {{.*}} @__msan_retval_origin_tls
-; CHECK-NEXT: ret <8 x i16>
-
-
; Test handling of vectors of pointers.
; Check that shadow of such vector is a vector of integers.
diff --git a/test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll b/test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll
new file mode 100644
index 000000000000..be3f1976daa1
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll
@@ -0,0 +1,68 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-track-origins=1 -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s
+; REQUIRES: x86
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Store intrinsic.
+
+define void @StoreIntrinsic(i8* %p, <4 x float> %x) nounwind uwtable sanitize_memory {
+ call void @llvm.x86.sse.storeu.ps(i8* %p, <4 x float> %x)
+ ret void
+}
+
+declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
+
+; CHECK-LABEL: @StoreIntrinsic
+; CHECK-NOT: br
+; CHECK-NOT: = or
+; CHECK: store <4 x i32> {{.*}} align 1
+; CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
+; CHECK: ret void
+
+
+; Load intrinsic.
+
+define <16 x i8> @LoadIntrinsic(i8* %p) nounwind uwtable sanitize_memory {
+ %call = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %p)
+ ret <16 x i8> %call
+}
+
+declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %p) nounwind
+
+; CHECK-LABEL: @LoadIntrinsic
+; CHECK: load <16 x i8>, <16 x i8>* {{.*}} align 1
+; CHECK-ORIGINS: [[ORIGIN:%[01-9a-z]+]] = load i32, i32* {{.*}}
+; CHECK-NOT: br
+; CHECK-NOT: = or
+; CHECK: call <16 x i8> @llvm.x86.sse3.ldu.dq
+; CHECK: store <16 x i8> {{.*}} @__msan_retval_tls
+; CHECK-ORIGINS: store i32 {{.*}}[[ORIGIN]], i32* @__msan_retval_origin_tls
+; CHECK: ret <16 x i8>
+
+
+; Simple NoMem intrinsic
+; Check that shadow is OR'ed, and origin is Select'ed
+; And no shadow checks!
+
+define <8 x i16> @Paddsw128(<8 x i16> %a, <8 x i16> %b) nounwind uwtable sanitize_memory {
+ %call = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i16> %call
+}
+
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) nounwind
+
+; CHECK-LABEL: @Paddsw128
+; CHECK-NEXT: load <8 x i16>, <8 x i16>* {{.*}} @__msan_param_tls
+; CHECK-ORIGINS: load i32, i32* {{.*}} @__msan_param_origin_tls
+; CHECK-NEXT: load <8 x i16>, <8 x i16>* {{.*}} @__msan_param_tls
+; CHECK-ORIGINS: load i32, i32* {{.*}} @__msan_param_origin_tls
+; CHECK-NEXT: = or <8 x i16>
+; CHECK-ORIGINS: = bitcast <8 x i16> {{.*}} to i128
+; CHECK-ORIGINS-NEXT: = icmp ne i128 {{.*}}, 0
+; CHECK-ORIGINS-NEXT: = select i1 {{.*}}, i32 {{.*}}, i32
+; CHECK-NEXT: call <8 x i16> @llvm.x86.sse2.padds.w
+; CHECK-NEXT: store <8 x i16> {{.*}} @__msan_retval_tls
+; CHECK-ORIGINS: store i32 {{.*}} @__msan_retval_origin_tls
+; CHECK-NEXT: ret <8 x i16>
diff --git a/test/Instrumentation/MemorySanitizer/pr32842.ll b/test/Instrumentation/MemorySanitizer/pr32842.ll
new file mode 100644
index 000000000000..5d74c9a193bf
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/pr32842.ll
@@ -0,0 +1,20 @@
+; Regression test for https://bugs.llvm.org/show_bug.cgi?id=32842
+;
+; RUN: opt < %s -msan -S | FileCheck %s
+;target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define zeroext i1 @_Z1fii(i32 %x, i32 %y) sanitize_memory {
+entry:
+ %cmp = icmp slt i32 %x, %y
+ ret i1 %cmp
+}
+
+; CHECK: [[X:[^ ]+]] = load{{.*}}__msan_param_tls{{.*}}
+; CHECK: [[Y:[^ ]+]] = load{{.*}}__msan_param_tls{{.*}}
+; CHECK: [[OR:[^ ]+]] = or i32 [[Y]], [[X]]
+
+; Make sure the shadow of the (x < y) comparison isn't truncated to i1.
+; CHECK-NOT: trunc i32 [[OR]] to i1
+; CHECK: [[CMP:[^ ]+]] = icmp ne i32 [[OR]], 0
+; CHECK: store i1 [[CMP]],{{.*}}__msan_retval_tls
diff --git a/test/Instrumentation/MemorySanitizer/vector_arith.ll b/test/Instrumentation/MemorySanitizer/vector_arith.ll
index 6541a1c3a394..8be085cff33d 100644
--- a/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -1,4 +1,5 @@
; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+; REQUIRES: x86
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Instrumentation/MemorySanitizer/vector_cmp.ll b/test/Instrumentation/MemorySanitizer/vector_cmp.ll
index fb54a5cb632e..62a5f573064e 100644
--- a/test/Instrumentation/MemorySanitizer/vector_cmp.ll
+++ b/test/Instrumentation/MemorySanitizer/vector_cmp.ll
@@ -1,4 +1,5 @@
; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+; REQUIRES: x86
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Instrumentation/MemorySanitizer/vector_cvt.ll b/test/Instrumentation/MemorySanitizer/vector_cvt.ll
index 55e91c74a316..beedb0e63e50 100644
--- a/test/Instrumentation/MemorySanitizer/vector_cvt.ll
+++ b/test/Instrumentation/MemorySanitizer/vector_cvt.ll
@@ -1,4 +1,5 @@
; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+; REQUIRES: x86
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Instrumentation/MemorySanitizer/vector_pack.ll b/test/Instrumentation/MemorySanitizer/vector_pack.ll
index 31c0c62980ec..deb03d84802a 100644
--- a/test/Instrumentation/MemorySanitizer/vector_pack.ll
+++ b/test/Instrumentation/MemorySanitizer/vector_pack.ll
@@ -1,4 +1,5 @@
; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+; REQUIRES: x86
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Instrumentation/MemorySanitizer/vector_shift.ll b/test/Instrumentation/MemorySanitizer/vector_shift.ll
index 978bad3b6979..a4b8fdbd603f 100644
--- a/test/Instrumentation/MemorySanitizer/vector_shift.ll
+++ b/test/Instrumentation/MemorySanitizer/vector_shift.ll
@@ -1,4 +1,5 @@
; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+; REQUIRES: x86
; Test instrumentation of vector shift instructions.
diff --git a/test/LTO/Resolution/X86/ifunc.ll b/test/LTO/Resolution/X86/ifunc.ll
new file mode 100644
index 000000000000..63723763430c
--- /dev/null
+++ b/test/LTO/Resolution/X86/ifunc.ll
@@ -0,0 +1,15 @@
+; RUN: opt -module-summary -o %t.bc %s
+; RUN: llvm-lto2 run %t.bc -r %t.bc,foo,pl -o %t2
+; RUN: llvm-nm %t2.0 | FileCheck %s
+; CHECK: T foo
+; CHECK: t foo_ifunc
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@foo = ifunc i32 (i32), i64 ()* @foo_ifunc
+
+define internal i64 @foo_ifunc() {
+entry:
+ ret i64 0
+}
diff --git a/test/MC/AArch64/directive-cpu-err.s b/test/MC/AArch64/directive-cpu-err.s
new file mode 100644
index 000000000000..ea0d28e71815
--- /dev/null
+++ b/test/MC/AArch64/directive-cpu-err.s
@@ -0,0 +1,9 @@
+// RUN: not llvm-mc -triple aarch64-linux-gnu %s 2> %t > /dev/null
+// RUN: FileCheck %s < %t
+
+ .cpu invalid
+ // CHECK: error: unknown CPU name
+
+ .cpu generic+wibble+nowobble
+ // CHECK: :[[@LINE-1]]:18: error: unsupported architectural extension
+ // CHECK: :[[@LINE-2]]:25: error: unsupported architectural extension
diff --git a/test/MC/AArch64/label-arithmetic-diags-elf.s b/test/MC/AArch64/label-arithmetic-diags-elf.s
index e9d92d591fac..dbfdd24f8dc9 100644
--- a/test/MC/AArch64/label-arithmetic-diags-elf.s
+++ b/test/MC/AArch64/label-arithmetic-diags-elf.s
@@ -1,5 +1,14 @@
// RUN: not llvm-mc -triple aarch64-elf -filetype=obj %s -o /dev/null 2>&1 | FileCheck %s
+ .data
+b:
+ .fill 300
+e:
+ .byte e - b
+ // CHECK: error: value evaluated as 300 is out of range.
+ // CHECK-NEXT: .byte e - b
+ // CHECK-NEXT: ^
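+ // The .fill directive places e 300 bytes after b, so e - b evaluates
+ // to 300, which does not fit in the 0-255 range of a single .byte.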
+
.section sec_x
start:
.space 5000
diff --git a/test/MC/AMDGPU/flat.s b/test/MC/AMDGPU/flat.s
index c6894c35f4d7..4e81799fe9f9 100644
--- a/test/MC/AMDGPU/flat.s
+++ b/test/MC/AMDGPU/flat.s
@@ -30,31 +30,6 @@ flat_load_dword v1, v[3:4] glc slc
// CI: flat_load_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x00,0x01]
// VI: flat_load_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x53,0xdc,0x03,0x00,0x00,0x01]
-flat_load_dword v1, v[3:4] glc tfe
-// NOSI: error:
-// CI: flat_load_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x80,0x01]
-// VI: flat_load_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x51,0xdc,0x03,0x00,0x80,0x01]
-
-flat_load_dword v1, v[3:4] glc slc tfe
-// NOSI: error:
-// CI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01]
-// VI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x53,0xdc,0x03,0x00,0x80,0x01]
-
-flat_load_dword v1, v[3:4] slc
-// NOSI: error:
-// CI: flat_load_dword v1, v[3:4] slc ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x00,0x01]
-// VI: flat_load_dword v1, v[3:4] slc ; encoding: [0x00,0x00,0x52,0xdc,0x03,0x00,0x00,0x01]
-
-flat_load_dword v1, v[3:4] slc tfe
-// NOSI: error:
-// CI: flat_load_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x80,0x01]
-// VI: flat_load_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x52,0xdc,0x03,0x00,0x80,0x01]
-
-flat_load_dword v1, v[3:4] tfe
-// NOSI: error:
-// CI: flat_load_dword v1, v[3:4] tfe ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x80,0x01]
-// VI: flat_load_dword v1, v[3:4] tfe ; encoding: [0x00,0x00,0x50,0xdc,0x03,0x00,0x80,0x01]
-
flat_store_dword v[3:4], v1
// NOSI: error:
// CIVI: flat_store_dword v[3:4], v1 ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x00,0x00]
@@ -67,66 +42,25 @@ flat_store_dword v[3:4], v1 glc slc
// NOSI: error:
// CIVI: flat_store_dword v[3:4], v1 glc slc ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x00,0x00]
-flat_store_dword v[3:4], v1 glc tfe
-// NOSI: error:
-// CIVI: flat_store_dword v[3:4], v1 glc tfe ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x80,0x00]
-
-flat_store_dword v[3:4], v1 glc slc tfe
-// NOSI: error:
-// CIVI: flat_store_dword v[3:4], v1 glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00]
flat_store_dword v[3:4], v1 slc
// NOSI: error:
// CIVI: flat_store_dword v[3:4], v1 slc ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x00,0x00]
-flat_store_dword v[3:4], v1 slc tfe
-// NOSI: error:
-// CIVI: flat_store_dword v[3:4], v1 slc tfe ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x80,0x00]
-
-flat_store_dword v[3:4], v1 tfe
-// NOSI: error:
-// CIVI: flat_store_dword v[3:4], v1 tfe ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x80,0x00]
-
// FIXME: For atomic instructions, glc must be placed immediately following
// the data register. These forms aren't currently supported:
// flat_atomic_add v1, v[3:4], v5 slc glc
-// flat_atomic_add v1, v[3:4], v5 slc glc tfe
-// flat_atomic_add v1, v[3:4], v5 slc tfe glc
-// flat_atomic_add v1, v[3:4], v5 tfe glc
-// flat_atomic_add v[3:4], v5 tfe glc
-// flat_atomic_add v1, v[3:4], v5 tfe glc slc
-// flat_atomic_add v1, v[3:4], v5 tfe slc glc
flat_atomic_add v1 v[3:4], v5 glc slc
// NOSI: error:
// CI: flat_atomic_add v1, v[3:4], v5 glc slc ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x00,0x01]
// VI: flat_atomic_add v1, v[3:4], v5 glc slc ; encoding: [0x00,0x00,0x0b,0xdd,0x03,0x05,0x00,0x01]
-flat_atomic_add v1 v[3:4], v5 glc tfe
-// NOSI: error:
-// CI: flat_atomic_add v1, v[3:4], v5 glc tfe ; encoding: [0x00,0x00,0xc9,0xdc,0x03,0x05,0x80,0x01]
-// VI: flat_atomic_add v1, v[3:4], v5 glc tfe ; encoding: [0x00,0x00,0x09,0xdd,0x03,0x05,0x80,0x01]
-
-flat_atomic_add v1 v[3:4], v5 glc slc tfe
-// NOSI: error:
-// CI: flat_atomic_add v1, v[3:4], v5 glc slc tfe ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x80,0x01]
-// VI: flat_atomic_add v1, v[3:4], v5 glc slc tfe ; encoding: [0x00,0x00,0x0b,0xdd,0x03,0x05,0x80,0x01]
-
flat_atomic_add v[3:4], v5 slc
// NOSI: error:
// CI: flat_atomic_add v[3:4], v5 slc ; encoding: [0x00,0x00,0xca,0xdc,0x03,0x05,0x00,0x00]
// VI: flat_atomic_add v[3:4], v5 slc ; encoding: [0x00,0x00,0x0a,0xdd,0x03,0x05,0x00,0x00]
-flat_atomic_add v[3:4], v5 slc tfe
-// NOSI: error:
-// CI: flat_atomic_add v[3:4], v5 slc tfe ; encoding: [0x00,0x00,0xca,0xdc,0x03,0x05,0x80,0x00]
-// VI: flat_atomic_add v[3:4], v5 slc tfe ; encoding: [0x00,0x00,0x0a,0xdd,0x03,0x05,0x80,0x00]
-
-flat_atomic_add v[3:4], v5 tfe
-// NOSI: error:
-// CI: flat_atomic_add v[3:4], v5 tfe ; encoding: [0x00,0x00,0xc8,0xdc,0x03,0x05,0x80,0x00]
-// VI: flat_atomic_add v[3:4], v5 tfe ; encoding: [0x00,0x00,0x08,0xdd,0x03,0x05,0x80,0x00]
-
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
diff --git a/test/MC/AMDGPU/literal16.s b/test/MC/AMDGPU/literal16.s
index e578ce82372f..97d16c374285 100644
--- a/test/MC/AMDGPU/literal16.s
+++ b/test/MC/AMDGPU/literal16.s
@@ -133,16 +133,16 @@ v_add_f16 v1, 65535, v2
// K-constant
v_madmk_f16 v1, v2, 0x4280, v3
-// VI: v_madmk_f16_e32 v1, v2, 0x4280, v3 ; encoding: [0x02,0x07,0x02,0x48,0x80,0x42,0x00,0x00]
+// VI: v_madmk_f16 v1, v2, 0x4280, v3 ; encoding: [0x02,0x07,0x02,0x48,0x80,0x42,0x00,0x00]
v_madmk_f16 v1, v2, 1.0, v3
-// VI: v_madmk_f16_e32 v1, v2, 0x3c00, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x3c,0x00,0x00]
+// VI: v_madmk_f16 v1, v2, 0x3c00, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x3c,0x00,0x00]
v_madmk_f16 v1, v2, 1, v3
-// VI: v_madmk_f16_e32 v1, v2, 0x1, v3 ; encoding: [0x02,0x07,0x02,0x48,0x01,0x00,0x00,0x00]
+// VI: v_madmk_f16 v1, v2, 0x1, v3 ; encoding: [0x02,0x07,0x02,0x48,0x01,0x00,0x00,0x00]
v_madmk_f16 v1, v2, 64.0, v3
-// VI: v_madmk_f16_e32 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00]
+// VI: v_madmk_f16 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00]
v_add_f16_e32 v1, 64.0, v2
diff --git a/test/MC/AMDGPU/vop2.s b/test/MC/AMDGPU/vop2.s
index 078b68638008..79ea38e641a6 100644
--- a/test/MC/AMDGPU/vop2.s
+++ b/test/MC/AMDGPU/vop2.s
@@ -243,31 +243,31 @@ v_or_b32_e32 v1, v2, v3
v_xor_b32_e32 v1, v2, v3
// SICI: v_bfm_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x3c,0xd2,0x02,0x07,0x02,0x00]
-// VI: v_bfm_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_bfm_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00]
v_bfm_b32_e64 v1, v2, v3
// SICI: v_mac_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3e]
// VI: v_mac_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2c]
v_mac_f32_e32 v1, v2, v3
-// SICI: v_madmk_f32_e32 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x40,0x00,0x00,0x80,0x42]
-// VI: v_madmk_f32_e32 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x2e,0x00,0x00,0x80,0x42]
-v_madmk_f32_e32 v1, v2, 64.0, v3
+// SICI: v_madmk_f32 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x40,0x00,0x00,0x80,0x42]
+// VI: v_madmk_f32 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x2e,0x00,0x00,0x80,0x42]
+v_madmk_f32 v1, v2, 64.0, v3
-// SICI: v_madak_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x42,0x00,0x00,0x80,0x42]
-// VI: v_madak_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x30,0x00,0x00,0x80,0x42]
-v_madak_f32_e32 v1, v2, v3, 64.0
+// SICI: v_madak_f32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x42,0x00,0x00,0x80,0x42]
+// VI: v_madak_f32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x30,0x00,0x00,0x80,0x42]
+v_madak_f32 v1, v2, v3, 64.0
// SICI: v_bcnt_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x44,0xd2,0x02,0x07,0x02,0x00]
-// VI: v_bcnt_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8b,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_bcnt_u32_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x8b,0xd2,0x02,0x07,0x02,0x00]
v_bcnt_u32_b32_e64 v1, v2, v3
// SICI: v_mbcnt_lo_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x46,0xd2,0x02,0x07,0x02,0x00]
-// VI: v_mbcnt_lo_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8c,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_mbcnt_lo_u32_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x8c,0xd2,0x02,0x07,0x02,0x00]
v_mbcnt_lo_u32_b32_e64 v1, v2, v3
// SICI: v_mbcnt_hi_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x48,0xd2,0x02,0x07,0x02,0x00]
-// VI: v_mbcnt_hi_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8d,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_mbcnt_hi_u32_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x8d,0xd2,0x02,0x07,0x02,0x00]
v_mbcnt_hi_u32_b32_e64 v1, v2, v3
// SICI: v_add_i32_e32 v1, vcc, v2, v3 ; encoding: [0x02,0x07,0x02,0x4a]
@@ -376,31 +376,31 @@ v_subbrev_u32 v1, vcc, v2, v3, vcc
v_subbrev_u32 v1, s[0:1], v2, v3, vcc
// SICI: v_ldexp_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x56]
-// VI: v_ldexp_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x88,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_ldexp_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x88,0xd2,0x02,0x07,0x02,0x00]
v_ldexp_f32 v1, v2, v3
// SICI: v_cvt_pkaccum_u8_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x58]
-// VI: v_cvt_pkaccum_u8_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0xf0,0xd1,0x02,0x07,0x02,0x00]
+// VI: v_cvt_pkaccum_u8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0xf0,0xd1,0x02,0x07,0x02,0x00]
v_cvt_pkaccum_u8_f32 v1, v2, v3
// SICI: v_cvt_pknorm_i16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5a]
-// VI: v_cvt_pknorm_i16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x94,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_cvt_pknorm_i16_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x94,0xd2,0x02,0x07,0x02,0x00]
v_cvt_pknorm_i16_f32 v1, v2, v3
// SICI: v_cvt_pknorm_u16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5c]
-// VI: v_cvt_pknorm_u16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x95,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_cvt_pknorm_u16_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x95,0xd2,0x02,0x07,0x02,0x00]
v_cvt_pknorm_u16_f32 v1, v2, v3
// SICI: v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5e]
-// VI: v_cvt_pkrtz_f16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x96,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_cvt_pkrtz_f16_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x96,0xd2,0x02,0x07,0x02,0x00]
v_cvt_pkrtz_f16_f32 v1, v2, v3
// SICI: v_cvt_pk_u16_u32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x60,0xd2,0x02,0x07,0x02,0x00]
-// VI: v_cvt_pk_u16_u32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x97,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_cvt_pk_u16_u32 v1, v2, v3 ; encoding: [0x01,0x00,0x97,0xd2,0x02,0x07,0x02,0x00]
v_cvt_pk_u16_u32_e64 v1, v2, v3
// SICI: v_cvt_pk_i16_i32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x62,0xd2,0x02,0x07,0x02,0x00]
-// VI: v_cvt_pk_i16_i32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x98,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_cvt_pk_i16_i32 v1, v2, v3 ; encoding: [0x01,0x00,0x98,0xd2,0x02,0x07,0x02,0x00]
v_cvt_pk_i16_i32_e64 v1, v2, v3
// NOSICI: error: instruction not supported on this GPU
@@ -430,12 +430,12 @@ v_mac_f16_e32 v1, v2, v3
// NOSICI: error: instruction not supported on this GPU
// NOSICI: v_madmk_f16 v1, v2, 64.0, v3
-// VI: v_madmk_f16_e32 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00]
+// VI: v_madmk_f16 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00]
v_madmk_f16 v1, v2, 64.0, v3
// NOSICI: error: instruction not supported on this GPU
// NOSICI: v_madak_f16 v1, v2, v3, 64.0
-// VI: v_madak_f16_e32 v1, v2, v3, 0x5400 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x54,0x00,0x00]
+// VI: v_madak_f16 v1, v2, v3, 0x5400 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x54,0x00,0x00]
v_madak_f16 v1, v2, v3, 64.0
// NOSICI: error: instruction not supported on this GPU
diff --git a/test/MC/AMDGPU/vop3-convert.s b/test/MC/AMDGPU/vop3-convert.s
index 8bc88a08dda2..781aa672d3c4 100644
--- a/test/MC/AMDGPU/vop3-convert.s
+++ b/test/MC/AMDGPU/vop3-convert.s
@@ -288,31 +288,31 @@ v_or_b32 v1, v2, v3
v_xor_b32 v1, v2, v3
// SICI: v_bfm_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3c]
-// VI: v_bfm_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_bfm_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00]
v_bfm_b32 v1, v2, v3
// SICI: v_bcnt_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x44]
-// VI: v_bcnt_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8b,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_bcnt_u32_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x8b,0xd2,0x02,0x07,0x02,0x00]
v_bcnt_u32_b32 v1, v2, v3
// SICI: v_mbcnt_lo_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x46]
-// VI: v_mbcnt_lo_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8c,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_mbcnt_lo_u32_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x8c,0xd2,0x02,0x07,0x02,0x00]
v_mbcnt_lo_u32_b32 v1, v2, v3
// SICI: v_mbcnt_hi_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x48]
-// VI: v_mbcnt_hi_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8d,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_mbcnt_hi_u32_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x8d,0xd2,0x02,0x07,0x02,0x00]
v_mbcnt_hi_u32_b32 v1, v2, v3
// SICI: v_cvt_pk_u16_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x60]
-// VI: v_cvt_pk_u16_u32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x97,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_cvt_pk_u16_u32 v1, v2, v3 ; encoding: [0x01,0x00,0x97,0xd2,0x02,0x07,0x02,0x00]
v_cvt_pk_u16_u32 v1, v2, v3
// SICI: v_cvt_pk_i16_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x62]
-// VI: v_cvt_pk_i16_i32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x98,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_cvt_pk_i16_i32 v1, v2, v3 ; encoding: [0x01,0x00,0x98,0xd2,0x02,0x07,0x02,0x00]
v_cvt_pk_i16_i32 v1, v2, v3
// SICI: v_bfm_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3c]
-// VI: v_bfm_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00]
+// VI: v_bfm_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00]
v_bfm_b32 v1, v2, v3
// NOSICI: error: instruction not supported on this GPU
diff --git a/test/MC/AsmParser/altmacro_string_escape.s b/test/MC/AsmParser/altmacro_string_escape.s
new file mode 100644
index 000000000000..bcc9e845953e
--- /dev/null
+++ b/test/MC/AsmParser/altmacro_string_escape.s
@@ -0,0 +1,29 @@
+# RUN: llvm-mc -triple i386-linux-gnu %s | FileCheck %s
+
+.altmacro
+# single-character string escape
+# To include any single character literally in a string
+# (even if the character would otherwise have some special meaning),
+# you can prefix the character with `!'.
+# For example, you can write `<4.3 !> 5.4!!>' to get the literal text `4.3 > 5.4!'.
+
+# CHECK: workForFun:
+.macro fun1 number
+ .if \number=5
+  labelNotWork:
+ .else
+ workForFun:
+ .endif
+.endm
+
+# CHECK: workForFun2:
+.macro fun2 string
+ .if \string
+ workForFun2:
+ .else
+ notworkForFun2:
+ .endif
+.endm
+
+fun1 <5!!>
+fun2 <5!>4>
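+
+# With the escapes expanded, fun1 receives the literal text `5!', so its
+# .if condition reads `5!=5' (5 not equal 5), which is false, and the
+# .else branch defines workForFun; fun2 receives `5>4', which is true,
+# so workForFun2 is defined, matching the CHECK lines above.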
diff --git a/test/MC/Disassembler/AMDGPU/flat_vi.txt b/test/MC/Disassembler/AMDGPU/flat_vi.txt
index a7013092b493..bcc395078050 100644
--- a/test/MC/Disassembler/AMDGPU/flat_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/flat_vi.txt
@@ -9,39 +9,15 @@
# VI: flat_load_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x53,0xdc,0x03,0x00,0x00,0x01]
0x00 0x00 0x53 0xdc 0x03 0x00 0x00 0x01
-# VI: flat_load_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x51,0xdc,0x03,0x00,0x80,0x01]
-0x00 0x00 0x51 0xdc 0x03 0x00 0x80 0x01
-
-# VI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x53,0xdc,0x03,0x00,0x80,0x01]
-0x00 0x00 0x53 0xdc 0x03 0x00 0x80 0x01
-
# VI: flat_load_dword v1, v[3:4] slc ; encoding: [0x00,0x00,0x52,0xdc,0x03,0x00,0x00,0x01]
0x00 0x00 0x52 0xdc 0x03 0x00 0x00 0x01
-# VI: flat_load_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x52,0xdc,0x03,0x00,0x80,0x01]
-0x00 0x00 0x52 0xdc 0x03 0x00 0x80 0x01
-
-# VI: flat_load_dword v1, v[3:4] tfe ; encoding: [0x00,0x00,0x50,0xdc,0x03,0x00,0x80,0x01]
-0x00 0x00 0x50 0xdc 0x03 0x00 0x80 0x01
-
# VI: flat_atomic_add v1, v[3:4], v5 glc slc ; encoding: [0x00,0x00,0x0b,0xdd,0x03,0x05,0x00,0x01]
0x00 0x00 0x0b 0xdd 0x03 0x05 0x00 0x01
-# VI: flat_atomic_add v1, v[3:4], v5 glc tfe ; encoding: [0x00,0x00,0x09,0xdd,0x03,0x05,0x80,0x01]
-0x00 0x00 0x09 0xdd 0x03 0x05 0x80 0x01
-
-# VI: flat_atomic_add v1, v[3:4], v5 glc slc tfe ; encoding: [0x00,0x00,0x0b,0xdd,0x03,0x05,0x80,0x01]
-0x00 0x00 0x0b 0xdd 0x03 0x05 0x80 0x01
-
# VI: flat_atomic_add v[3:4], v5 slc ; encoding: [0x00,0x00,0x0a,0xdd,0x03,0x05,0x00,0x00]
0x00 0x00 0x0a 0xdd 0x03 0x05 0x00 0x00
-# VI: flat_atomic_add v[3:4], v5 slc tfe ; encoding: [0x00,0x00,0x0a,0xdd,0x03,0x05,0x80,0x00]
-0x00 0x00 0x0a 0xdd 0x03 0x05 0x80 0x00
-
-# VI: flat_atomic_add v[3:4], v5 tfe ; encoding: [0x00,0x00,0x08,0xdd,0x03,0x05,0x80,0x00]
-0x00 0x00 0x08 0xdd 0x03 0x05 0x80 0x00
-
# VI: flat_load_ubyte v1, v[3:4] ; encoding: [0x00,0x00,0x40,0xdc,0x03,0x00,0x00,0x01]
0x00 0x00 0x40 0xdc 0x03 0x00 0x00 0x01
diff --git a/test/MC/Disassembler/AMDGPU/literal16_vi.txt b/test/MC/Disassembler/AMDGPU/literal16_vi.txt
index 362e87703694..a3cdae33a4cc 100644
--- a/test/MC/Disassembler/AMDGPU/literal16_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/literal16_vi.txt
@@ -44,11 +44,11 @@
# VI: v_add_f16_e32 v1, 0, v3 ; encoding: [0x80,0x06,0x02,0x3e]
0xff 0x06 0x02 0x3e 0x00 0x00 0x00 0x00
-# VI: v_madmk_f16_e32 v1, v2, 0x41, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x00]
+# VI: v_madmk_f16 v1, v2, 0x41, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x00]
0x02 0x07 0x02 0x48 0x41 0x00 0x00 0x00
-# VI: v_madmk_f16_e32 v1, v2, 0x10041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x01,0x00]
+# VI: v_madmk_f16 v1, v2, 0x10041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x01,0x00]
0x02 0x07 0x02 0x48 0x41 0x00 0x01 0x00
-# VI: v_madmk_f16_e32 v1, v2, 0x1000041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x01]
+# VI: v_madmk_f16 v1, v2, 0x1000041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x01]
0x02 0x07 0x02 0x48 0x41 0x00 0x00 0x01
diff --git a/test/MC/Disassembler/AMDGPU/vop2_vi.txt b/test/MC/Disassembler/AMDGPU/vop2_vi.txt
index 4a47c8157971..b6f556bd55be 100644
--- a/test/MC/Disassembler/AMDGPU/vop2_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/vop2_vi.txt
@@ -72,25 +72,25 @@
# VI: v_xor_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2a]
0x02 0x07 0x02 0x2a
-# VI: v_bfm_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00]
+# VI: v_bfm_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00]
0x01 0x00 0x93 0xd2 0x02 0x07 0x02 0x00
# VI: v_mac_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2c]
0x02 0x07 0x02 0x2c
-# VI: v_madmk_f32_e32 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x2e,0x00,0x00,0x80,0x42]
+# VI: v_madmk_f32 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x2e,0x00,0x00,0x80,0x42]
0x02 0x07 0x02 0x2e 0x00 0x00 0x80 0x42
-# VI: v_madak_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x30,0x00,0x00,0x80,0x42]
+# VI: v_madak_f32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x30,0x00,0x00,0x80,0x42]
0x02 0x07 0x02 0x30 0x00 0x00 0x80 0x42
-# VI: v_bcnt_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8b,0xd2,0x02,0x07,0x02,0x00]
+# VI: v_bcnt_u32_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x8b,0xd2,0x02,0x07,0x02,0x00]
0x01 0x00 0x8b 0xd2 0x02 0x07 0x02 0x00
-# VI: v_mbcnt_lo_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8c,0xd2,0x02,0x07,0x02,0x00]
+# VI: v_mbcnt_lo_u32_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x8c,0xd2,0x02,0x07,0x02,0x00]
0x01 0x00 0x8c 0xd2 0x02 0x07 0x02 0x00
-# VI: v_mbcnt_hi_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8d,0xd2,0x02,0x07,0x02,0x00]
+# VI: v_mbcnt_hi_u32_b32 v1, v2, v3 ; encoding: [0x01,0x00,0x8d,0xd2,0x02,0x07,0x02,0x00]
0x01 0x00 0x8d 0xd2 0x02 0x07 0x02 0x00
# VI: v_add_i32_e32 v1, vcc, v2, v3 ; encoding: [0x02,0x07,0x02,0x32]
@@ -171,25 +171,25 @@
# VI: v_subbrev_u32_e64 v1, s[0:1], v2, v3, vcc ; encoding: [0x01,0x00,0x1e,0xd1,0x02,0x07,0xaa,0x01]
0x01 0x00 0x1e 0xd1 0x02 0x07 0xaa 0x01
-# VI: v_ldexp_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x88,0xd2,0x02,0x07,0x02,0x00]
+# VI: v_ldexp_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x88,0xd2,0x02,0x07,0x02,0x00]
0x01 0x00 0x88 0xd2 0x02 0x07 0x02 0x00
-# VI: v_cvt_pkaccum_u8_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0xf0,0xd1,0x02,0x07,0x02,0x00]
+# VI: v_cvt_pkaccum_u8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0xf0,0xd1,0x02,0x07,0x02,0x00]
0x01 0x00 0xf0 0xd1 0x02 0x07 0x02 0x00
-# VI: v_cvt_pknorm_i16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x94,0xd2,0x02,0x07,0x02,0x00]
+# VI: v_cvt_pknorm_i16_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x94,0xd2,0x02,0x07,0x02,0x00]
0x01 0x00 0x94 0xd2 0x02 0x07 0x02 0x00
-# VI: v_cvt_pknorm_u16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x95,0xd2,0x02,0x07,0x02,0x00]
+# VI: v_cvt_pknorm_u16_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x95,0xd2,0x02,0x07,0x02,0x00]
0x01 0x00 0x95 0xd2 0x02 0x07 0x02 0x00
-# VI: v_cvt_pkrtz_f16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x96,0xd2,0x02,0x07,0x02,0x00]
+# VI: v_cvt_pkrtz_f16_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x96,0xd2,0x02,0x07,0x02,0x00]
0x01 0x00 0x96 0xd2 0x02 0x07 0x02 0x00
-# VI: v_cvt_pk_u16_u32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x97,0xd2,0x02,0x07,0x02,0x00]
+# VI: v_cvt_pk_u16_u32 v1, v2, v3 ; encoding: [0x01,0x00,0x97,0xd2,0x02,0x07,0x02,0x00]
0x01 0x00 0x97 0xd2 0x02 0x07 0x02 0x00
-# VI: v_cvt_pk_i16_i32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x98,0xd2,0x02,0x07,0x02,0x00]
+# VI: v_cvt_pk_i16_i32 v1, v2, v3 ; encoding: [0x01,0x00,0x98,0xd2,0x02,0x07,0x02,0x00]
0x01 0x00 0x98 0xd2 0x02 0x07 0x02 0x00
# VI: v_add_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3e]
@@ -207,10 +207,10 @@
# VI: v_mac_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x46]
0x02 0x07 0x02 0x46
-# VI: v_madmk_f16_e32 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x00,0x80,0x42]
+# VI: v_madmk_f16 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x00,0x80,0x42]
0x02 0x07 0x02 0x48 0x00 0x00 0x80 0x42
-# VI: v_madak_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x00,0x80,0x42]
+# VI: v_madak_f16 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x00,0x80,0x42]
0x02 0x07 0x02 0x4a 0x00 0x00 0x80 0x42
# VI: v_add_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c]
diff --git a/test/MC/Disassembler/AMDGPU/vop3_vi.txt b/test/MC/Disassembler/AMDGPU/vop3_vi.txt
index c15fbaa1e3a8..a1cc1f06c3cb 100644
--- a/test/MC/Disassembler/AMDGPU/vop3_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/vop3_vi.txt
@@ -81,6 +81,24 @@
# VI: v_clrexcp ; encoding: [0x00,0x00,0x75,0xd1,0x00,0x00,0x00,0x00]
0x00 0x00 0x75 0xd1 0x00 0x00 0x00 0x00
+# VI: v_fract_f64_e64 v[5:6], s[2:3] ; encoding: [0x05,0x00,0x72,0xd1,0x02,0x00,0x00,0x00]
+0x05,0x00,0x72,0xd1,0x02,0x00,0x00,0x00
+
+# VI: v_fract_f64_e64 v[5:6], -4.0 ; encoding: [0x05,0x00,0x72,0xd1,0xf7,0x00,0x00,0x00]
+0x05,0x00,0x72,0xd1,0xf7,0x00,0x00,0x00
+
+# VI: v_fract_f64_e64 v[5:6], -s[2:3] ; encoding: [0x05,0x00,0x72,0xd1,0x02,0x00,0x00,0x20]
+0x05,0x00,0x72,0xd1,0x02,0x00,0x00,0x20
+
+# VI: v_fract_f64_e64 v[5:6], |s[2:3]| ; encoding: [0x05,0x01,0x72,0xd1,0x02,0x00,0x00,0x00]
+0x05,0x01,0x72,0xd1,0x02,0x00,0x00,0x00
+
+# VI: v_fract_f64_e64 v[5:6], s[2:3] clamp ; encoding: [0x05,0x80,0x72,0xd1,0x02,0x00,0x00,0x00]
+0x05,0x80,0x72,0xd1,0x02,0x00,0x00,0x00
+
+# VI: v_fract_f64_e64 v[5:6], s[2:3] mul:2 ; encoding: [0x05,0x00,0x72,0xd1,0x02,0x00,0x00,0x08]
+0x05,0x00,0x72,0xd1,0x02,0x00,0x00,0x08
+
# VI: v_fract_f32_e64 v1, -v2 ; encoding: [0x01,0x00,0x5b,0xd1,0x02,0x01,0x00,0x20]
0x01 0x00 0x5b 0xd1 0x02 0x01 0x00 0x20
diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding-p9vector.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding-p9vector.txt
new file mode 100644
index 000000000000..1a7964808a44
--- /dev/null
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding-p9vector.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc --disassemble %s -triple powerpc64le-unknown-unknown -mcpu=pwr9 | FileCheck %s
+
+# CHECK: mtvsrdd 6, 0, 3
+0x66 0x1b 0xc0 0x7c
diff --git a/test/MC/Disassembler/SystemZ/insns-z13.txt b/test/MC/Disassembler/SystemZ/insns-z13.txt
index 5a983860df1a..4f5ec43f7348 100644
--- a/test/MC/Disassembler/SystemZ/insns-z13.txt
+++ b/test/MC/Disassembler/SystemZ/insns-z13.txt
@@ -2,6 +2,297 @@
# RUN: llvm-mc --disassemble %s -triple=s390x-linux-gnu -mcpu=z13 \
# RUN: | FileCheck %s
+# CHECK: lcbb %r0, 0, 0
+0xe7 0x00 0x00 0x00 0x00 0x27
+
+# CHECK: lcbb %r0, 0, 15
+0xe7 0x00 0x00 0x00 0xf0 0x27
+
+# CHECK: lcbb %r0, 4095, 0
+0xe7 0x00 0x0f 0xff 0x00 0x27
+
+# CHECK: lcbb %r0, 0(%r15), 0
+0xe7 0x00 0xf0 0x00 0x00 0x27
+
+# CHECK: lcbb %r0, 0(%r15,%r1), 0
+0xe7 0x0f 0x10 0x00 0x00 0x27
+
+# CHECK: lcbb %r15, 0, 0
+0xe7 0xf0 0x00 0x00 0x00 0x27
+
+# CHECK: lcbb %r2, 1383(%r3,%r4), 8
+0xe7 0x23 0x45 0x67 0x80 0x27
+
+# CHECK: llzrgf %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x3a
+
+# CHECK: llzrgf %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x3a
+
+# CHECK: llzrgf %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x3a
+
+# CHECK: llzrgf %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x3a
+
+# CHECK: llzrgf %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x3a
+
+# CHECK: llzrgf %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x3a
+
+# CHECK: llzrgf %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x3a
+
+# CHECK: llzrgf %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x3a
+
+# CHECK: llzrgf %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x3a
+
+# CHECK: llzrgf %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x3a
+
+# CHECK: lochi %r11, 42, 0
+0xec 0xb0 0x00 0x2a 0x00 0x42
+
+# CHECK: lochio %r11, 42
+0xec 0xb1 0x00 0x2a 0x00 0x42
+
+# CHECK: lochih %r11, 42
+0xec 0xb2 0x00 0x2a 0x00 0x42
+
+# CHECK: lochinle %r11, 42
+0xec 0xb3 0x00 0x2a 0x00 0x42
+
+# CHECK: lochil %r11, -1
+0xec 0xb4 0xff 0xff 0x00 0x42
+
+# CHECK: lochinhe %r11, 42
+0xec 0xb5 0x00 0x2a 0x00 0x42
+
+# CHECK: lochilh %r11, -1
+0xec 0xb6 0xff 0xff 0x00 0x42
+
+# CHECK: lochine %r11, 0
+0xec 0xb7 0x00 0x00 0x00 0x42
+
+# CHECK: lochie %r11, 0
+0xec 0xb8 0x00 0x00 0x00 0x42
+
+# CHECK: lochinlh %r11, 42
+0xec 0xb9 0x00 0x2a 0x00 0x42
+
+# CHECK: lochihe %r11, 255
+0xec 0xba 0x00 0xff 0x00 0x42
+
+# CHECK: lochinl %r11, 255
+0xec 0xbb 0x00 0xff 0x00 0x42
+
+# CHECK: lochile %r11, 32767
+0xec 0xbc 0x7f 0xff 0x00 0x42
+
+# CHECK: lochinh %r11, 32767
+0xec 0xbd 0x7f 0xff 0x00 0x42
+
+# CHECK: lochino %r11, 32512
+0xec 0xbe 0x7f 0x00 0x00 0x42
+
+# CHECK: lochi %r11, 32512, 15
+0xec 0xbf 0x7f 0x00 0x00 0x42
+
+# CHECK: locghi %r11, 42, 0
+0xec 0xb0 0x00 0x2a 0x00 0x46
+
+# CHECK: locghio %r11, 42
+0xec 0xb1 0x00 0x2a 0x00 0x46
+
+# CHECK: locghih %r11, 42
+0xec 0xb2 0x00 0x2a 0x00 0x46
+
+# CHECK: locghinle %r11, 42
+0xec 0xb3 0x00 0x2a 0x00 0x46
+
+# CHECK: locghil %r11, -1
+0xec 0xb4 0xff 0xff 0x00 0x46
+
+# CHECK: locghinhe %r11, 42
+0xec 0xb5 0x00 0x2a 0x00 0x46
+
+# CHECK: locghilh %r11, -1
+0xec 0xb6 0xff 0xff 0x00 0x46
+
+# CHECK: locghine %r11, 0
+0xec 0xb7 0x00 0x00 0x00 0x46
+
+# CHECK: locghie %r11, 0
+0xec 0xb8 0x00 0x00 0x00 0x46
+
+# CHECK: locghinlh %r11, 42
+0xec 0xb9 0x00 0x2a 0x00 0x46
+
+# CHECK: locghihe %r11, 255
+0xec 0xba 0x00 0xff 0x00 0x46
+
+# CHECK: locghinl %r11, 255
+0xec 0xbb 0x00 0xff 0x00 0x46
+
+# CHECK: locghile %r11, 32767
+0xec 0xbc 0x7f 0xff 0x00 0x46
+
+# CHECK: locghinh %r11, 32767
+0xec 0xbd 0x7f 0xff 0x00 0x46
+
+# CHECK: locghino %r11, 32512
+0xec 0xbe 0x7f 0x00 0x00 0x46
+
+# CHECK: locghi %r11, 32512, 15
+0xec 0xbf 0x7f 0x00 0x00 0x46
+
+# CHECK: lochhi %r11, 42, 0
+0xec 0xb0 0x00 0x2a 0x00 0x4e
+
+# CHECK: lochhio %r11, 42
+0xec 0xb1 0x00 0x2a 0x00 0x4e
+
+# CHECK: lochhih %r11, 42
+0xec 0xb2 0x00 0x2a 0x00 0x4e
+
+# CHECK: lochhinle %r11, 42
+0xec 0xb3 0x00 0x2a 0x00 0x4e
+
+# CHECK: lochhil %r11, -1
+0xec 0xb4 0xff 0xff 0x00 0x4e
+
+# CHECK: lochhinhe %r11, 42
+0xec 0xb5 0x00 0x2a 0x00 0x4e
+
+# CHECK: lochhilh %r11, -1
+0xec 0xb6 0xff 0xff 0x00 0x4e
+
+# CHECK: lochhine %r11, 0
+0xec 0xb7 0x00 0x00 0x00 0x4e
+
+# CHECK: lochhie %r11, 0
+0xec 0xb8 0x00 0x00 0x00 0x4e
+
+# CHECK: lochhinlh %r11, 42
+0xec 0xb9 0x00 0x2a 0x00 0x4e
+
+# CHECK: lochhihe %r11, 255
+0xec 0xba 0x00 0xff 0x00 0x4e
+
+# CHECK: lochhinl %r11, 255
+0xec 0xbb 0x00 0xff 0x00 0x4e
+
+# CHECK: lochhile %r11, 32767
+0xec 0xbc 0x7f 0xff 0x00 0x4e
+
+# CHECK: lochhinh %r11, 32767
+0xec 0xbd 0x7f 0xff 0x00 0x4e
+
+# CHECK: lochhino %r11, 32512
+0xec 0xbe 0x7f 0x00 0x00 0x4e
+
+# CHECK: lochhi %r11, 32512, 15
+0xec 0xbf 0x7f 0x00 0x00 0x4e
+
+# CHECK: locfh %r7, 6399(%r8), 0
+0xeb 0x70 0x88 0xff 0x01 0xe0
+
+# CHECK: locfho %r7, 6399(%r8)
+0xeb 0x71 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhh %r7, 6399(%r8)
+0xeb 0x72 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhnle %r7, 6399(%r8)
+0xeb 0x73 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhl %r7, 6399(%r8)
+0xeb 0x74 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhnhe %r7, 6399(%r8)
+0xeb 0x75 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhlh %r7, 6399(%r8)
+0xeb 0x76 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhne %r7, 6399(%r8)
+0xeb 0x77 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhe %r7, 6399(%r8)
+0xeb 0x78 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhnlh %r7, 6399(%r8)
+0xeb 0x79 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhhe %r7, 6399(%r8)
+0xeb 0x7a 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhnl %r7, 6399(%r8)
+0xeb 0x7b 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhle %r7, 6399(%r8)
+0xeb 0x7c 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhnh %r7, 6399(%r8)
+0xeb 0x7d 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhno %r7, 6399(%r8)
+0xeb 0x7e 0x88 0xff 0x01 0xe0
+
+# CHECK: locfh %r7, 6399(%r8), 15
+0xeb 0x7f 0x88 0xff 0x01 0xe0
+
+# CHECK: locfhr %r11, %r3, 0
+0xb9 0xe0 0x00 0xb3
+
+# CHECK: locfhro %r11, %r3
+0xb9 0xe0 0x10 0xb3
+
+# CHECK: locfhrh %r11, %r3
+0xb9 0xe0 0x20 0xb3
+
+# CHECK: locfhrnle %r11, %r3
+0xb9 0xe0 0x30 0xb3
+
+# CHECK: locfhrl %r11, %r3
+0xb9 0xe0 0x40 0xb3
+
+# CHECK: locfhrnhe %r11, %r3
+0xb9 0xe0 0x50 0xb3
+
+# CHECK: locfhrlh %r11, %r3
+0xb9 0xe0 0x60 0xb3
+
+# CHECK: locfhrne %r11, %r3
+0xb9 0xe0 0x70 0xb3
+
+# CHECK: locfhre %r11, %r3
+0xb9 0xe0 0x80 0xb3
+
+# CHECK: locfhrnlh %r11, %r3
+0xb9 0xe0 0x90 0xb3
+
+# CHECK: locfhrhe %r11, %r3
+0xb9 0xe0 0xa0 0xb3
+
+# CHECK: locfhrnl %r11, %r3
+0xb9 0xe0 0xb0 0xb3
+
+# CHECK: locfhrle %r11, %r3
+0xb9 0xe0 0xc0 0xb3
+
+# CHECK: locfhrnh %r11, %r3
+0xb9 0xe0 0xd0 0xb3
+
+# CHECK: locfhrno %r11, %r3
+0xb9 0xe0 0xe0 0xb3
+
+# CHECK: locfhr %r11, %r3, 15
+0xb9 0xe0 0xf0 0xb3
+
# CHECK: lzrf %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x3b
@@ -62,4548 +353,4299 @@
# CHECK: lzrg %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x2a
-# CHECK: llzrgf %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x3a
+# CHECK: ppno %r2, %r10
+0xb9 0x3c 0x00 0x2a
-# CHECK: llzrgf %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x3a
+# CHECK: ppno %r2, %r14
+0xb9 0x3c 0x00 0x2e
-# CHECK: llzrgf %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x3a
+# CHECK: ppno %r14, %r2
+0xb9 0x3c 0x00 0xe2
-# CHECK: llzrgf %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x3a
+# CHECK: ppno %r14, %r10
+0xb9 0x3c 0x00 0xea
-# CHECK: llzrgf %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x3a
+# CHECK: stocfh %r0, 0, 0
+0xeb 0x00 0x00 0x00 0x00 0xe1
-# CHECK: llzrgf %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x3a
+# CHECK: stocfh %r0, 0, 15
+0xeb 0x0f 0x00 0x00 0x00 0xe1
-# CHECK: llzrgf %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x3a
+# CHECK: stocfh %r0, -524288, 0
+0xeb 0x00 0x00 0x00 0x80 0xe1
-# CHECK: llzrgf %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x3a
+# CHECK: stocfh %r0, 524287, 0
+0xeb 0x00 0x0f 0xff 0x7f 0xe1
-# CHECK: llzrgf %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x3a
+# CHECK: stocfh %r0, 0(%r1), 0
+0xeb 0x00 0x10 0x00 0x00 0xe1
-# CHECK: llzrgf %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x3a
+# CHECK: stocfh %r0, 0(%r15), 0
+0xeb 0x00 0xf0 0x00 0x00 0xe1
-#CHECK: lcbb %r0, 0, 0
-0xe7 0x00 0x00 0x00 0x00 0x27
+# CHECK: stocfh %r15, 0, 0
+0xeb 0xf0 0x00 0x00 0x00 0xe1
+
+# CHECK: stocfho %r1, 2(%r3)
+0xeb 0x11 0x30 0x02 0x00 0xe1
+
+# CHECK: stocfhh %r1, 2(%r3)
+0xeb 0x12 0x30 0x02 0x00 0xe1
+
+# CHECK: stocfhnle %r1, 2(%r3)
+0xeb 0x13 0x30 0x02 0x00 0xe1
-#CHECK: lcbb %r1, 2475(%r7,%r8), 12
-0xe7 0x17 0x89 0xab 0xc0 0x27
+# CHECK: stocfhl %r1, 2(%r3)
+0xeb 0x14 0x30 0x02 0x00 0xe1
-#CHECK: lcbb %r15, 4095(%r15,%r15), 15
-0xe7 0xff 0xff 0xff 0xf0 0x27
+# CHECK: stocfhnhe %r1, 2(%r3)
+0xeb 0x15 0x30 0x02 0x00 0xe1
-#CHECK: va %v0, %v0, %v0, 11
+# CHECK: stocfhlh %r1, 2(%r3)
+0xeb 0x16 0x30 0x02 0x00 0xe1
+
+# CHECK: stocfhne %r1, 2(%r3)
+0xeb 0x17 0x30 0x02 0x00 0xe1
+
+# CHECK: stocfhe %r1, 2(%r3)
+0xeb 0x18 0x30 0x02 0x00 0xe1
+
+# CHECK: stocfhnlh %r1, 2(%r3)
+0xeb 0x19 0x30 0x02 0x00 0xe1
+
+# CHECK: stocfhhe %r1, 2(%r3)
+0xeb 0x1a 0x30 0x02 0x00 0xe1
+
+# CHECK: stocfhnl %r1, 2(%r3)
+0xeb 0x1b 0x30 0x02 0x00 0xe1
+
+# CHECK: stocfhle %r1, 2(%r3)
+0xeb 0x1c 0x30 0x02 0x00 0xe1
+
+# CHECK: stocfhnh %r1, 2(%r3)
+0xeb 0x1d 0x30 0x02 0x00 0xe1
+
+# CHECK: stocfhno %r1, 2(%r3)
+0xeb 0x1e 0x30 0x02 0x00 0xe1
+
+# CHECK: stocfh %r1, 2(%r3), 15
+0xeb 0x1f 0x30 0x02 0x00 0xe1
+
+# CHECK: va %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xf3
-#CHECK: va %v18, %v3, %v20, 11
+# CHECK: va %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xf3
-#CHECK: va %v31, %v31, %v31, 11
+# CHECK: va %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xf3
-#CHECK: vab %v0, %v0, %v0
+# CHECK: vab %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xf3
-#CHECK: vab %v18, %v3, %v20
+# CHECK: vab %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xf3
-#CHECK: vab %v31, %v31, %v31
+# CHECK: vab %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xf3
-#CHECK: vacc %v0, %v0, %v0, 11
+# CHECK: vac %v0, %v0, %v0, %v0, 11
+0xe7 0x00 0x0b 0x00 0x00 0xbb
+
+# CHECK: vac %v3, %v20, %v5, %v22, 11
+0xe7 0x34 0x5b 0x00 0x65 0xbb
+
+# CHECK: vac %v31, %v31, %v31, %v31, 11
+0xe7 0xff 0xfb 0x00 0xff 0xbb
+
+# CHECK: vacc %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xf1
-#CHECK: vacc %v18, %v3, %v20, 11
+# CHECK: vacc %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xf1
-#CHECK: vacc %v31, %v31, %v31, 11
+# CHECK: vacc %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xf1
-#CHECK: vaccb %v0, %v0, %v0
+# CHECK: vaccb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xf1
-#CHECK: vaccb %v18, %v3, %v20
+# CHECK: vaccb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xf1
-#CHECK: vaccb %v31, %v31, %v31
+# CHECK: vaccb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xf1
-#CHECK: vaccc %v0, %v0, %v0, %v0, 11
+# CHECK: vaccc %v0, %v0, %v0, %v0, 11
0xe7 0x00 0x0b 0x00 0x00 0xb9
-#CHECK: vaccc %v3, %v20, %v5, %v22, 11
+# CHECK: vaccc %v3, %v20, %v5, %v22, 11
0xe7 0x34 0x5b 0x00 0x65 0xb9
-#CHECK: vaccc %v31, %v31, %v31, %v31, 11
+# CHECK: vaccc %v31, %v31, %v31, %v31, 11
0xe7 0xff 0xfb 0x00 0xff 0xb9
-#CHECK: vacccq %v0, %v0, %v0, %v0
+# CHECK: vacccq %v0, %v0, %v0, %v0
0xe7 0x00 0x04 0x00 0x00 0xb9
-#CHECK: vacccq %v3, %v20, %v5, %v22
+# CHECK: vacccq %v3, %v20, %v5, %v22
0xe7 0x34 0x54 0x00 0x65 0xb9
-#CHECK: vacccq %v31, %v31, %v31, %v31
+# CHECK: vacccq %v31, %v31, %v31, %v31
0xe7 0xff 0xf4 0x00 0xff 0xb9
-#CHECK: vaccf %v0, %v0, %v0
+# CHECK: vaccf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xf1
-#CHECK: vaccf %v18, %v3, %v20
+# CHECK: vaccf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xf1
-#CHECK: vaccf %v31, %v31, %v31
+# CHECK: vaccf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xf1
-#CHECK: vaccg %v0, %v0, %v0
+# CHECK: vaccg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xf1
-#CHECK: vaccg %v18, %v3, %v20
+# CHECK: vaccg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xf1
-#CHECK: vaccg %v31, %v31, %v31
+# CHECK: vaccg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xf1
-#CHECK: vacch %v0, %v0, %v0
+# CHECK: vacch %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xf1
-#CHECK: vacch %v18, %v3, %v20
+# CHECK: vacch %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xf1
-#CHECK: vacch %v31, %v31, %v31
+# CHECK: vacch %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xf1
-#CHECK: vaccq %v0, %v0, %v0
+# CHECK: vaccq %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x40 0xf1
-#CHECK: vaccq %v18, %v3, %v20
+# CHECK: vaccq %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x4a 0xf1
-#CHECK: vaccq %v31, %v31, %v31
+# CHECK: vaccq %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x4e 0xf1
-#CHECK: vac %v0, %v0, %v0, %v0, 11
-0xe7 0x00 0x0b 0x00 0x00 0xbb
-
-#CHECK: vac %v3, %v20, %v5, %v22, 11
-0xe7 0x34 0x5b 0x00 0x65 0xbb
-
-#CHECK: vac %v31, %v31, %v31, %v31, 11
-0xe7 0xff 0xfb 0x00 0xff 0xbb
-
-#CHECK: vacq %v0, %v0, %v0, %v0
+# CHECK: vacq %v0, %v0, %v0, %v0
0xe7 0x00 0x04 0x00 0x00 0xbb
-#CHECK: vacq %v3, %v20, %v5, %v22
+# CHECK: vacq %v3, %v20, %v5, %v22
0xe7 0x34 0x54 0x00 0x65 0xbb
-#CHECK: vacq %v31, %v31, %v31, %v31
+# CHECK: vacq %v31, %v31, %v31, %v31
0xe7 0xff 0xf4 0x00 0xff 0xbb
-#CHECK: vaf %v0, %v0, %v0
+# CHECK: vaf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xf3
-#CHECK: vaf %v18, %v3, %v20
+# CHECK: vaf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xf3
-#CHECK: vaf %v31, %v31, %v31
+# CHECK: vaf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xf3
-#CHECK: vag %v0, %v0, %v0
+# CHECK: vag %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xf3
-#CHECK: vag %v18, %v3, %v20
+# CHECK: vag %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xf3
-#CHECK: vag %v31, %v31, %v31
+# CHECK: vag %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xf3
-#CHECK: vah %v0, %v0, %v0
+# CHECK: vah %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xf3
-#CHECK: vah %v18, %v3, %v20
+# CHECK: vah %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xf3
-#CHECK: vah %v31, %v31, %v31
+# CHECK: vah %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xf3
-#CHECK: vaq %v0, %v0, %v0
+# CHECK: vaq %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x40 0xf3
-#CHECK: vaq %v18, %v3, %v20
+# CHECK: vaq %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x4a 0xf3
-#CHECK: vaq %v31, %v31, %v31
+# CHECK: vaq %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x4e 0xf3
-#CHECK: vavg %v0, %v0, %v0, 11
+# CHECK: vavg %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xf2
-#CHECK: vavg %v18, %v3, %v20, 11
+# CHECK: vavg %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xf2
-#CHECK: vavg %v31, %v31, %v31, 11
+# CHECK: vavg %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xf2
-#CHECK: vavgb %v0, %v0, %v0
+# CHECK: vavgb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xf2
-#CHECK: vavgb %v18, %v3, %v20
+# CHECK: vavgb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xf2
-#CHECK: vavgb %v31, %v31, %v31
+# CHECK: vavgb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xf2
-#CHECK: vavgf %v0, %v0, %v0
+# CHECK: vavgf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xf2
-#CHECK: vavgf %v18, %v3, %v20
+# CHECK: vavgf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xf2
-#CHECK: vavgf %v31, %v31, %v31
+# CHECK: vavgf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xf2
-#CHECK: vavgg %v0, %v0, %v0
+# CHECK: vavgg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xf2
-#CHECK: vavgg %v18, %v3, %v20
+# CHECK: vavgg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xf2
-#CHECK: vavgg %v31, %v31, %v31
+# CHECK: vavgg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xf2
-#CHECK: vavgh %v0, %v0, %v0
+# CHECK: vavgh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xf2
-#CHECK: vavgh %v18, %v3, %v20
+# CHECK: vavgh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xf2
-#CHECK: vavgh %v31, %v31, %v31
+# CHECK: vavgh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xf2
-#CHECK: vavgl %v0, %v0, %v0, 11
+# CHECK: vavgl %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xf0
-#CHECK: vavgl %v18, %v3, %v20, 11
+# CHECK: vavgl %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xf0
-#CHECK: vavgl %v31, %v31, %v31, 11
+# CHECK: vavgl %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xf0
-#CHECK: vavglb %v0, %v0, %v0
+# CHECK: vavglb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xf0
-#CHECK: vavglb %v18, %v3, %v20
+# CHECK: vavglb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xf0
-#CHECK: vavglb %v31, %v31, %v31
+# CHECK: vavglb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xf0
-#CHECK: vavglf %v0, %v0, %v0
+# CHECK: vavglf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xf0
-#CHECK: vavglf %v18, %v3, %v20
+# CHECK: vavglf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xf0
-#CHECK: vavglf %v31, %v31, %v31
+# CHECK: vavglf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xf0
-#CHECK: vavglg %v0, %v0, %v0
+# CHECK: vavglg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xf0
-#CHECK: vavglg %v18, %v3, %v20
+# CHECK: vavglg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xf0
-#CHECK: vavglg %v31, %v31, %v31
+# CHECK: vavglg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xf0
-#CHECK: vavglh %v0, %v0, %v0
+# CHECK: vavglh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xf0
-#CHECK: vavglh %v18, %v3, %v20
+# CHECK: vavglh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xf0
-#CHECK: vavglh %v31, %v31, %v31
+# CHECK: vavglh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xf0
-#CHECK: vcdg %v0, %v0, 11, 0, 0
+# CHECK: vcdg %v0, %v0, 11, 0, 0
0xe7 0x00 0x00 0x00 0xb0 0xc3
-#CHECK: vcdg %v19, %v14, 11, 4, 10
+# CHECK: vcdg %v19, %v14, 11, 4, 10
0xe7 0x3e 0x00 0xa4 0xb8 0xc3
-#CHECK: vcdg %v31, %v31, 11, 7, 15
+# CHECK: vcdg %v31, %v31, 11, 7, 15
0xe7 0xff 0x00 0xf7 0xbc 0xc3
-#CHECK: vcdgb %v0, %v0, 0, 0
+# CHECK: vcdgb %v0, %v0, 0, 0
0xe7 0x00 0x00 0x00 0x30 0xc3
-#CHECK: vcdgb %v19, %v14, 4, 10
+# CHECK: vcdgb %v19, %v14, 4, 10
0xe7 0x3e 0x00 0xa4 0x38 0xc3
-#CHECK: vcdgb %v31, %v31, 7, 15
+# CHECK: vcdgb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xf7 0x3c 0xc3
-#CHECK: vcdlg %v0, %v0, 11, 0, 0
+# CHECK: vcdlg %v0, %v0, 11, 0, 0
0xe7 0x00 0x00 0x00 0xb0 0xc1
-#CHECK: vcdlg %v19, %v14, 11, 4, 10
+# CHECK: vcdlg %v19, %v14, 11, 4, 10
0xe7 0x3e 0x00 0xa4 0xb8 0xc1
-#CHECK: vcdlg %v31, %v31, 11, 7, 15
+# CHECK: vcdlg %v31, %v31, 11, 7, 15
0xe7 0xff 0x00 0xf7 0xbc 0xc1
-#CHECK: vcdlgb %v0, %v0, 0, 0
+# CHECK: vcdlgb %v0, %v0, 0, 0
0xe7 0x00 0x00 0x00 0x30 0xc1
-#CHECK: vcdlgb %v19, %v14, 4, 10
+# CHECK: vcdlgb %v19, %v14, 4, 10
0xe7 0x3e 0x00 0xa4 0x38 0xc1
-#CHECK: vcdlgb %v31, %v31, 7, 15
+# CHECK: vcdlgb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xf7 0x3c 0xc1
-#CHECK: vceq %v0, %v0, %v0, 11, 9
+# CHECK: vceq %v0, %v0, %v0, 11, 9
0xe7 0x00 0x00 0x90 0xb0 0xf8
-#CHECK: vceq %v18, %v3, %v20, 11, 9
+# CHECK: vceq %v18, %v3, %v20, 11, 9
0xe7 0x23 0x40 0x90 0xba 0xf8
-#CHECK: vceq %v7, %v24, %v9, 11, 9
+# CHECK: vceq %v7, %v24, %v9, 11, 9
0xe7 0x78 0x90 0x90 0xb4 0xf8
-#CHECK: vceq %v31, %v31, %v31, 11, 9
+# CHECK: vceq %v31, %v31, %v31, 11, 9
0xe7 0xff 0xf0 0x90 0xbe 0xf8
-#CHECK: vceqb %v0, %v0, %v0
+# CHECK: vceqb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xf8
-#CHECK: vceqb %v18, %v3, %v20
+# CHECK: vceqb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xf8
-#CHECK: vceqbs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x04 0xf8
-
-#CHECK: vceqb %v31, %v31, %v31
+# CHECK: vceqb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xf8
-#CHECK: vceqf %v0, %v0, %v0
+# CHECK: vceqbs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x04 0xf8
+
+# CHECK: vceqf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xf8
-#CHECK: vceqf %v18, %v3, %v20
+# CHECK: vceqf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xf8
-#CHECK: vceqfs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x24 0xf8
-
-#CHECK: vceqf %v31, %v31, %v31
+# CHECK: vceqf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xf8
-#CHECK: vceqg %v0, %v0, %v0
+# CHECK: vceqfs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x24 0xf8
+
+# CHECK: vceqg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xf8
-#CHECK: vceqg %v18, %v3, %v20
+# CHECK: vceqg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xf8
-#CHECK: vceqgs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x34 0xf8
-
-#CHECK: vceqg %v31, %v31, %v31
+# CHECK: vceqg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xf8
-#CHECK: vceqh %v0, %v0, %v0
+# CHECK: vceqgs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x34 0xf8
+
+# CHECK: vceqh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xf8
-#CHECK: vceqh %v18, %v3, %v20
+# CHECK: vceqh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xf8
-#CHECK: vceqhs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x14 0xf8
-
-#CHECK: vceqh %v31, %v31, %v31
+# CHECK: vceqh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xf8
-#CHECK: vcgd %v0, %v0, 11, 0, 0
+# CHECK: vceqhs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x14 0xf8
+
+# CHECK: vcgd %v0, %v0, 11, 0, 0
0xe7 0x00 0x00 0x00 0xb0 0xc2
-#CHECK: vcgd %v19, %v14, 11, 4, 10
+# CHECK: vcgd %v19, %v14, 11, 4, 10
0xe7 0x3e 0x00 0xa4 0xb8 0xc2
-#CHECK: vcgd %v31, %v31, 11, 7, 15
+# CHECK: vcgd %v31, %v31, 11, 7, 15
0xe7 0xff 0x00 0xf7 0xbc 0xc2
-#CHECK: vcgdb %v0, %v0, 0, 0
+# CHECK: vcgdb %v0, %v0, 0, 0
0xe7 0x00 0x00 0x00 0x30 0xc2
-#CHECK: vcgdb %v19, %v14, 4, 10
+# CHECK: vcgdb %v19, %v14, 4, 10
0xe7 0x3e 0x00 0xa4 0x38 0xc2
-#CHECK: vcgdb %v31, %v31, 7, 15
+# CHECK: vcgdb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xf7 0x3c 0xc2
-#CHECK: vch %v0, %v0, %v0, 11, 9
+# CHECK: vch %v0, %v0, %v0, 11, 9
0xe7 0x00 0x00 0x90 0xb0 0xfb
-#CHECK: vch %v18, %v3, %v20, 11, 9
+# CHECK: vch %v18, %v3, %v20, 11, 9
0xe7 0x23 0x40 0x90 0xba 0xfb
-#CHECK: vch %v7, %v24, %v9, 11, 9
+# CHECK: vch %v7, %v24, %v9, 11, 9
0xe7 0x78 0x90 0x90 0xb4 0xfb
-#CHECK: vch %v31, %v31, %v31, 11, 9
+# CHECK: vch %v31, %v31, %v31, 11, 9
0xe7 0xff 0xf0 0x90 0xbe 0xfb
-#CHECK: vchb %v0, %v0, %v0
+# CHECK: vchb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xfb
-#CHECK: vchb %v18, %v3, %v20
+# CHECK: vchb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xfb
-#CHECK: vchbs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x04 0xfb
-
-#CHECK: vchb %v31, %v31, %v31
+# CHECK: vchb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xfb
-#CHECK: vchf %v0, %v0, %v0
+# CHECK: vchbs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x04 0xfb
+
+# CHECK: vchf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xfb
-#CHECK: vchf %v18, %v3, %v20
+# CHECK: vchf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xfb
-#CHECK: vchfs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x24 0xfb
-
-#CHECK: vchf %v31, %v31, %v31
+# CHECK: vchf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xfb
-#CHECK: vchg %v0, %v0, %v0
+# CHECK: vchfs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x24 0xfb
+
+# CHECK: vchg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xfb
-#CHECK: vchg %v18, %v3, %v20
+# CHECK: vchg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xfb
-#CHECK: vchgs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x34 0xfb
-
-#CHECK: vchg %v31, %v31, %v31
+# CHECK: vchg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xfb
-#CHECK: vchh %v0, %v0, %v0
+# CHECK: vchgs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x34 0xfb
+
+# CHECK: vchh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xfb
-#CHECK: vchh %v18, %v3, %v20
+# CHECK: vchh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xfb
-#CHECK: vchhs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x14 0xfb
-
-#CHECK: vchh %v31, %v31, %v31
+# CHECK: vchh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xfb
-#CHECK: vchl %v0, %v0, %v0, 11, 9
+# CHECK: vchhs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x14 0xfb
+
+# CHECK: vchl %v0, %v0, %v0, 11, 9
0xe7 0x00 0x00 0x90 0xb0 0xf9
-#CHECK: vchl %v18, %v3, %v20, 11, 9
+# CHECK: vchl %v18, %v3, %v20, 11, 9
0xe7 0x23 0x40 0x90 0xba 0xf9
-#CHECK: vchl %v7, %v24, %v9, 11, 9
+# CHECK: vchl %v7, %v24, %v9, 11, 9
0xe7 0x78 0x90 0x90 0xb4 0xf9
-#CHECK: vchl %v31, %v31, %v31, 11, 9
+# CHECK: vchl %v31, %v31, %v31, 11, 9
0xe7 0xff 0xf0 0x90 0xbe 0xf9
-#CHECK: vchlb %v0, %v0, %v0
+# CHECK: vchlb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xf9
-#CHECK: vchlb %v18, %v3, %v20
+# CHECK: vchlb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xf9
-#CHECK: vchlbs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x04 0xf9
-
-#CHECK: vchlb %v31, %v31, %v31
+# CHECK: vchlb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xf9
-#CHECK: vchlf %v0, %v0, %v0
+# CHECK: vchlbs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x04 0xf9
+
+# CHECK: vchlf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xf9
-#CHECK: vchlf %v18, %v3, %v20
+# CHECK: vchlf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xf9
-#CHECK: vchlfs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x24 0xf9
-
-#CHECK: vchlf %v31, %v31, %v31
+# CHECK: vchlf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xf9
-#CHECK: vchlg %v0, %v0, %v0
+# CHECK: vchlfs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x24 0xf9
+
+# CHECK: vchlg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xf9
-#CHECK: vchlg %v18, %v3, %v20
+# CHECK: vchlg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xf9
-#CHECK: vchlgs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x34 0xf9
-
-#CHECK: vchlg %v31, %v31, %v31
+# CHECK: vchlg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xf9
-#CHECK: vchlh %v0, %v0, %v0
+# CHECK: vchlgs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x34 0xf9
+
+# CHECK: vchlh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xf9
-#CHECK: vchlh %v18, %v3, %v20
+# CHECK: vchlh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xf9
-#CHECK: vchlhs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x14 0xf9
-
-#CHECK: vchlh %v31, %v31, %v31
+# CHECK: vchlh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xf9
-#CHECK: vcksm %v0, %v0, %v0
+# CHECK: vchlhs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x14 0xf9
+
+# CHECK: vcksm %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x66
-#CHECK: vcksm %v18, %v3, %v20
+# CHECK: vcksm %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x66
-#CHECK: vcksm %v31, %v31, %v31
+# CHECK: vcksm %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x66
-#CHECK: vclgd %v0, %v0, 11, 0, 0
+# CHECK: vclgd %v0, %v0, 11, 0, 0
0xe7 0x00 0x00 0x00 0xb0 0xc0
-#CHECK: vclgd %v19, %v14, 11, 4, 10
+# CHECK: vclgd %v19, %v14, 11, 4, 10
0xe7 0x3e 0x00 0xa4 0xb8 0xc0
-#CHECK: vclgd %v31, %v31, 11, 7, 15
+# CHECK: vclgd %v31, %v31, 11, 7, 15
0xe7 0xff 0x00 0xf7 0xbc 0xc0
-#CHECK: vclgdb %v0, %v0, 0, 0
+# CHECK: vclgdb %v0, %v0, 0, 0
0xe7 0x00 0x00 0x00 0x30 0xc0
-#CHECK: vclgdb %v19, %v14, 4, 10
+# CHECK: vclgdb %v19, %v14, 4, 10
0xe7 0x3e 0x00 0xa4 0x38 0xc0
-#CHECK: vclgdb %v31, %v31, 7, 15
+# CHECK: vclgdb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xf7 0x3c 0xc0
-#CHECK: vclz %v0, %v0, 11
+# CHECK: vclz %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0x53
-#CHECK: vclz %v19, %v14, 11
+# CHECK: vclz %v19, %v14, 11
0xe7 0x3e 0x00 0x00 0xb8 0x53
-#CHECK: vclz %v31, %v31, 11
+# CHECK: vclz %v31, %v31, 11
0xe7 0xff 0x00 0x00 0xbc 0x53
-#CHECK: vclzb %v0, %v0
+# CHECK: vclzb %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x53
-#CHECK: vclzb %v19, %v14
+# CHECK: vclzb %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0x53
-#CHECK: vclzb %v31, %v31
+# CHECK: vclzb %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0x53
-#CHECK: vclzf %v0, %v0
+# CHECK: vclzf %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0x53
-#CHECK: vclzf %v19, %v14
+# CHECK: vclzf %v19, %v14
0xe7 0x3e 0x00 0x00 0x28 0x53
-#CHECK: vclzf %v31, %v31
+# CHECK: vclzf %v31, %v31
0xe7 0xff 0x00 0x00 0x2c 0x53
-#CHECK: vclzg %v0, %v0
+# CHECK: vclzg %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0x53
-#CHECK: vclzg %v19, %v14
+# CHECK: vclzg %v19, %v14
0xe7 0x3e 0x00 0x00 0x38 0x53
-#CHECK: vclzg %v31, %v31
+# CHECK: vclzg %v31, %v31
0xe7 0xff 0x00 0x00 0x3c 0x53
-#CHECK: vclzh %v0, %v0
+# CHECK: vclzh %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0x53
-#CHECK: vclzh %v19, %v14
+# CHECK: vclzh %v19, %v14
0xe7 0x3e 0x00 0x00 0x18 0x53
-#CHECK: vclzh %v31, %v31
+# CHECK: vclzh %v31, %v31
0xe7 0xff 0x00 0x00 0x1c 0x53
-#CHECK: vctz %v0, %v0, 11
+# CHECK: vctz %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0x52
-#CHECK: vctz %v19, %v14, 11
+# CHECK: vctz %v19, %v14, 11
0xe7 0x3e 0x00 0x00 0xb8 0x52
-#CHECK: vctz %v31, %v31, 11
+# CHECK: vctz %v31, %v31, 11
0xe7 0xff 0x00 0x00 0xbc 0x52
-#CHECK: vctzb %v0, %v0
+# CHECK: vctzb %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x52
-#CHECK: vctzb %v19, %v14
+# CHECK: vctzb %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0x52
-#CHECK: vctzb %v31, %v31
+# CHECK: vctzb %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0x52
-#CHECK: vctzf %v0, %v0
+# CHECK: vctzf %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0x52
-#CHECK: vctzf %v19, %v14
+# CHECK: vctzf %v19, %v14
0xe7 0x3e 0x00 0x00 0x28 0x52
-#CHECK: vctzf %v31, %v31
+# CHECK: vctzf %v31, %v31
0xe7 0xff 0x00 0x00 0x2c 0x52
-#CHECK: vctzg %v0, %v0
+# CHECK: vctzg %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0x52
-#CHECK: vctzg %v19, %v14
+# CHECK: vctzg %v19, %v14
0xe7 0x3e 0x00 0x00 0x38 0x52
-#CHECK: vctzg %v31, %v31
+# CHECK: vctzg %v31, %v31
0xe7 0xff 0x00 0x00 0x3c 0x52
-#CHECK: vctzh %v0, %v0
+# CHECK: vctzh %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0x52
-#CHECK: vctzh %v19, %v14
+# CHECK: vctzh %v19, %v14
0xe7 0x3e 0x00 0x00 0x18 0x52
-#CHECK: vctzh %v31, %v31
+# CHECK: vctzh %v31, %v31
0xe7 0xff 0x00 0x00 0x1c 0x52
-#CHECK: vec %v0, %v0, 11
+# CHECK: vec %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xdb
-#CHECK: vec %v19, %v14, 11
+# CHECK: vec %v19, %v14, 11
0xe7 0x3e 0x00 0x00 0xb8 0xdb
-#CHECK: vec %v31, %v31, 11
+# CHECK: vec %v31, %v31, 11
0xe7 0xff 0x00 0x00 0xbc 0xdb
-#CHECK: vecb %v0, %v0
+# CHECK: vecb %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xdb
-#CHECK: vecb %v19, %v14
+# CHECK: vecb %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0xdb
-#CHECK: vecb %v31, %v31
+# CHECK: vecb %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0xdb
-#CHECK: vecf %v0, %v0
+# CHECK: vecf %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xdb
-#CHECK: vecf %v19, %v14
+# CHECK: vecf %v19, %v14
0xe7 0x3e 0x00 0x00 0x28 0xdb
-#CHECK: vecf %v31, %v31
+# CHECK: vecf %v31, %v31
0xe7 0xff 0x00 0x00 0x2c 0xdb
-#CHECK: vecg %v0, %v0
+# CHECK: vecg %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xdb
-#CHECK: vecg %v19, %v14
+# CHECK: vecg %v19, %v14
0xe7 0x3e 0x00 0x00 0x38 0xdb
-#CHECK: vecg %v31, %v31
+# CHECK: vecg %v31, %v31
0xe7 0xff 0x00 0x00 0x3c 0xdb
-#CHECK: vech %v0, %v0
+# CHECK: vech %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xdb
-#CHECK: vech %v19, %v14
+# CHECK: vech %v19, %v14
0xe7 0x3e 0x00 0x00 0x18 0xdb
-#CHECK: vech %v31, %v31
+# CHECK: vech %v31, %v31
0xe7 0xff 0x00 0x00 0x1c 0xdb
-#CHECK: vecl %v0, %v0, 11
+# CHECK: vecl %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xd9
-#CHECK: vecl %v19, %v14, 11
+# CHECK: vecl %v19, %v14, 11
0xe7 0x3e 0x00 0x00 0xb8 0xd9
-#CHECK: vecl %v31, %v31, 11
+# CHECK: vecl %v31, %v31, 11
0xe7 0xff 0x00 0x00 0xbc 0xd9
-#CHECK: veclb %v0, %v0
+# CHECK: veclb %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xd9
-#CHECK: veclb %v19, %v14
+# CHECK: veclb %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0xd9
-#CHECK: veclb %v31, %v31
+# CHECK: veclb %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0xd9
-#CHECK: veclf %v0, %v0
+# CHECK: veclf %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xd9
-#CHECK: veclf %v19, %v14
+# CHECK: veclf %v19, %v14
0xe7 0x3e 0x00 0x00 0x28 0xd9
-#CHECK: veclf %v31, %v31
+# CHECK: veclf %v31, %v31
0xe7 0xff 0x00 0x00 0x2c 0xd9
-#CHECK: veclg %v0, %v0
+# CHECK: veclg %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xd9
-#CHECK: veclg %v19, %v14
+# CHECK: veclg %v19, %v14
0xe7 0x3e 0x00 0x00 0x38 0xd9
-#CHECK: veclg %v31, %v31
+# CHECK: veclg %v31, %v31
0xe7 0xff 0x00 0x00 0x3c 0xd9
-#CHECK: veclh %v0, %v0
+# CHECK: veclh %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xd9
-#CHECK: veclh %v19, %v14
+# CHECK: veclh %v19, %v14
0xe7 0x3e 0x00 0x00 0x18 0xd9
-#CHECK: veclh %v31, %v31
+# CHECK: veclh %v31, %v31
0xe7 0xff 0x00 0x00 0x1c 0xd9
-#CHECK: verim %v0, %v0, %v0, 0, 11
+# CHECK: verim %v0, %v0, %v0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x72
-#CHECK: verim %v3, %v20, %v5, 103, 11
+# CHECK: verim %v3, %v20, %v5, 103, 11
0xe7 0x34 0x50 0x67 0xb4 0x72
-#CHECK: verim %v31, %v31, %v31, 255, 11
+# CHECK: verim %v31, %v31, %v31, 255, 11
0xe7 0xff 0xf0 0xff 0xbe 0x72
-#CHECK: verimb %v0, %v0, %v0, 0
+# CHECK: verimb %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x72
-#CHECK: verimb %v3, %v20, %v5, 103
+# CHECK: verimb %v3, %v20, %v5, 103
0xe7 0x34 0x50 0x67 0x04 0x72
-#CHECK: verimb %v31, %v31, %v31, 255
+# CHECK: verimb %v31, %v31, %v31, 255
0xe7 0xff 0xf0 0xff 0x0e 0x72
-#CHECK: verimf %v0, %v0, %v0, 0
+# CHECK: verimf %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x72
-#CHECK: verimf %v3, %v20, %v5, 103
+# CHECK: verimf %v3, %v20, %v5, 103
0xe7 0x34 0x50 0x67 0x24 0x72
-#CHECK: verimf %v31, %v31, %v31, 255
+# CHECK: verimf %v31, %v31, %v31, 255
0xe7 0xff 0xf0 0xff 0x2e 0x72
-#CHECK: verimg %v0, %v0, %v0, 0
+# CHECK: verimg %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x30 0x72
-#CHECK: verimg %v3, %v20, %v5, 103
+# CHECK: verimg %v3, %v20, %v5, 103
0xe7 0x34 0x50 0x67 0x34 0x72
-#CHECK: verimg %v31, %v31, %v31, 255
+# CHECK: verimg %v31, %v31, %v31, 255
0xe7 0xff 0xf0 0xff 0x3e 0x72
-#CHECK: verimh %v0, %v0, %v0, 0
+# CHECK: verimh %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x72
-#CHECK: verimh %v3, %v20, %v5, 103
+# CHECK: verimh %v3, %v20, %v5, 103
0xe7 0x34 0x50 0x67 0x14 0x72
-#CHECK: verimh %v31, %v31, %v31, 255
+# CHECK: verimh %v31, %v31, %v31, 255
0xe7 0xff 0xf0 0xff 0x1e 0x72
-#CHECK: verllv %v0, %v0, %v0, 11
-0xe7 0x00 0x00 0x00 0xb0 0x73
-
-#CHECK: verllv %v18, %v3, %v20, 11
-0xe7 0x23 0x40 0x00 0xba 0x73
-
-#CHECK: verllv %v31, %v31, %v31, 11
-0xe7 0xff 0xf0 0x00 0xbe 0x73
-
-#CHECK: verllvb %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x00 0x73
-
-#CHECK: verllvb %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x0a 0x73
-
-#CHECK: verllvb %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x0e 0x73
-
-#CHECK: verllvf %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x20 0x73
-
-#CHECK: verllvf %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x2a 0x73
-
-#CHECK: verllvf %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x2e 0x73
-
-#CHECK: verllvg %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x30 0x73
-
-#CHECK: verllvg %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x3a 0x73
-
-#CHECK: verllvg %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x3e 0x73
-
-#CHECK: verllvh %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x10 0x73
-
-#CHECK: verllvh %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x1a 0x73
-
-#CHECK: verllvh %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x1e 0x73
-
-#CHECK: verll %v0, %v0, 0, 11
+# CHECK: verll %v0, %v0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x33
-#CHECK: verll %v12, %v18, 1110(%r3), 11
+# CHECK: verll %v12, %v18, 1110(%r3), 11
0xe7 0xc2 0x34 0x56 0xb4 0x33
-#CHECK: verll %v31, %v31, 4095(%r15), 11
+# CHECK: verll %v31, %v31, 4095(%r15), 11
0xe7 0xff 0xff 0xff 0xbc 0x33
-#CHECK: verllb %v0, %v0, 0
+# CHECK: verllb %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x33
-#CHECK: verllb %v12, %v18, 1110(%r3)
+# CHECK: verllb %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x04 0x33
-#CHECK: verllb %v31, %v31, 4095(%r15)
+# CHECK: verllb %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x0c 0x33
-#CHECK: verllf %v0, %v0, 0
+# CHECK: verllf %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x33
-#CHECK: verllf %v12, %v18, 1110(%r3)
+# CHECK: verllf %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x24 0x33
-#CHECK: verllf %v31, %v31, 4095(%r15)
+# CHECK: verllf %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x2c 0x33
-#CHECK: verllg %v0, %v0, 0
+# CHECK: verllg %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x30 0x33
-#CHECK: verllg %v12, %v18, 1110(%r3)
+# CHECK: verllg %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x34 0x33
-#CHECK: verllg %v31, %v31, 4095(%r15)
+# CHECK: verllg %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x3c 0x33
-#CHECK: verllh %v0, %v0, 0
+# CHECK: verllh %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x33
-#CHECK: verllh %v12, %v18, 1110(%r3)
+# CHECK: verllh %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x14 0x33
-#CHECK: verllh %v31, %v31, 4095(%r15)
+# CHECK: verllh %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x1c 0x33

-#CHECK: veslv %v0, %v0, %v0, 11
-0xe7 0x00 0x00 0x00 0xb0 0x70
+# CHECK: verllv %v0, %v0, %v0, 11
+0xe7 0x00 0x00 0x00 0xb0 0x73

-#CHECK: veslv %v18, %v3, %v20, 11
-0xe7 0x23 0x40 0x00 0xba 0x70
+# CHECK: verllv %v18, %v3, %v20, 11
+0xe7 0x23 0x40 0x00 0xba 0x73

-#CHECK: veslv %v31, %v31, %v31, 11
-0xe7 0xff 0xf0 0x00 0xbe 0x70
+# CHECK: verllv %v31, %v31, %v31, 11
+0xe7 0xff 0xf0 0x00 0xbe 0x73

-#CHECK: veslvb %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x00 0x70
+# CHECK: verllvb %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x00 0x73

-#CHECK: veslvb %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x0a 0x70
+# CHECK: verllvb %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x0a 0x73

-#CHECK: veslvb %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x0e 0x70
+# CHECK: verllvb %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x0e 0x73

-#CHECK: veslvf %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x20 0x70
+# CHECK: verllvf %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x20 0x73

-#CHECK: veslvf %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x2a 0x70
+# CHECK: verllvf %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x2a 0x73

-#CHECK: veslvf %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x2e 0x70
+# CHECK: verllvf %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x2e 0x73

-#CHECK: veslvg %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x30 0x70
+# CHECK: verllvg %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x30 0x73

-#CHECK: veslvg %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x3a 0x70
+# CHECK: verllvg %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x3a 0x73

-#CHECK: veslvg %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x3e 0x70
+# CHECK: verllvg %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x3e 0x73

-#CHECK: veslvh %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x10 0x70
+# CHECK: verllvh %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x10 0x73

-#CHECK: veslvh %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x1a 0x70
+# CHECK: verllvh %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x1a 0x73

-#CHECK: veslvh %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x1e 0x70
+# CHECK: verllvh %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x1e 0x73

-#CHECK: vesl %v0, %v0, 0, 11
+# CHECK: vesl %v0, %v0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x30
-#CHECK: vesl %v12, %v18, 1110(%r3), 11
+# CHECK: vesl %v12, %v18, 1110(%r3), 11
0xe7 0xc2 0x34 0x56 0xb4 0x30
-#CHECK: vesl %v31, %v31, 4095(%r15), 11
+# CHECK: vesl %v31, %v31, 4095(%r15), 11
0xe7 0xff 0xff 0xff 0xbc 0x30
-#CHECK: veslb %v0, %v0, 0
+# CHECK: veslb %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x30
-#CHECK: veslb %v12, %v18, 1110(%r3)
+# CHECK: veslb %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x04 0x30
-#CHECK: veslb %v31, %v31, 4095(%r15)
+# CHECK: veslb %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x0c 0x30
-#CHECK: veslf %v0, %v0, 0
+# CHECK: veslf %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x30
-#CHECK: veslf %v12, %v18, 1110(%r3)
+# CHECK: veslf %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x24 0x30
-#CHECK: veslf %v31, %v31, 4095(%r15)
+# CHECK: veslf %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x2c 0x30
-#CHECK: veslg %v0, %v0, 0
+# CHECK: veslg %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x30 0x30
-#CHECK: veslg %v12, %v18, 1110(%r3)
+# CHECK: veslg %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x34 0x30
-#CHECK: veslg %v31, %v31, 4095(%r15)
+# CHECK: veslg %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x3c 0x30
-#CHECK: veslh %v0, %v0, 0
+# CHECK: veslh %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x30
-#CHECK: veslh %v12, %v18, 1110(%r3)
+# CHECK: veslh %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x14 0x30
-#CHECK: veslh %v31, %v31, 4095(%r15)
+# CHECK: veslh %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x1c 0x30

-#CHECK: vesrav %v0, %v0, %v0, 11
-0xe7 0x00 0x00 0x00 0xb0 0x7a
+# CHECK: veslv %v0, %v0, %v0, 11
+0xe7 0x00 0x00 0x00 0xb0 0x70

-#CHECK: vesrav %v18, %v3, %v20, 11
-0xe7 0x23 0x40 0x00 0xba 0x7a
+# CHECK: veslv %v18, %v3, %v20, 11
+0xe7 0x23 0x40 0x00 0xba 0x70

-#CHECK: vesrav %v31, %v31, %v31, 11
-0xe7 0xff 0xf0 0x00 0xbe 0x7a
+# CHECK: veslv %v31, %v31, %v31, 11
+0xe7 0xff 0xf0 0x00 0xbe 0x70

-#CHECK: vesravb %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x00 0x7a
+# CHECK: veslvb %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x00 0x70

-#CHECK: vesravb %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x0a 0x7a
+# CHECK: veslvb %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x0a 0x70

-#CHECK: vesravb %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x0e 0x7a
+# CHECK: veslvb %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x0e 0x70

-#CHECK: vesravf %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x20 0x7a
+# CHECK: veslvf %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x20 0x70

-#CHECK: vesravf %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x2a 0x7a
+# CHECK: veslvf %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x2a 0x70

-#CHECK: vesravf %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x2e 0x7a
+# CHECK: veslvf %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x2e 0x70

-#CHECK: vesravg %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x30 0x7a
+# CHECK: veslvg %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x30 0x70

-#CHECK: vesravg %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x3a 0x7a
+# CHECK: veslvg %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x3a 0x70

-#CHECK: vesravg %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x3e 0x7a
+# CHECK: veslvg %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x3e 0x70

-#CHECK: vesravh %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x10 0x7a
+# CHECK: veslvh %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x10 0x70

-#CHECK: vesravh %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x1a 0x7a
+# CHECK: veslvh %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x1a 0x70

-#CHECK: vesravh %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x1e 0x7a
+# CHECK: veslvh %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x1e 0x70

-#CHECK: vesra %v0, %v0, 0, 11
+# CHECK: vesra %v0, %v0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x3a
-#CHECK: vesra %v12, %v18, 1110(%r3), 11
+# CHECK: vesra %v12, %v18, 1110(%r3), 11
0xe7 0xc2 0x34 0x56 0xb4 0x3a
-#CHECK: vesra %v31, %v31, 4095(%r15), 11
+# CHECK: vesra %v31, %v31, 4095(%r15), 11
0xe7 0xff 0xff 0xff 0xbc 0x3a
-#CHECK: vesrab %v0, %v0, 0
+# CHECK: vesrab %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x3a
-#CHECK: vesrab %v12, %v18, 1110(%r3)
+# CHECK: vesrab %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x04 0x3a
-#CHECK: vesrab %v31, %v31, 4095(%r15)
+# CHECK: vesrab %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x0c 0x3a
-#CHECK: vesraf %v0, %v0, 0
+# CHECK: vesraf %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x3a
-#CHECK: vesraf %v12, %v18, 1110(%r3)
+# CHECK: vesraf %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x24 0x3a
-#CHECK: vesraf %v31, %v31, 4095(%r15)
+# CHECK: vesraf %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x2c 0x3a
-#CHECK: vesrag %v0, %v0, 0
+# CHECK: vesrag %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x30 0x3a
-#CHECK: vesrag %v12, %v18, 1110(%r3)
+# CHECK: vesrag %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x34 0x3a
-#CHECK: vesrag %v31, %v31, 4095(%r15)
+# CHECK: vesrag %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x3c 0x3a
-#CHECK: vesrah %v0, %v0, 0
+# CHECK: vesrah %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x3a
-#CHECK: vesrah %v12, %v18, 1110(%r3)
+# CHECK: vesrah %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x14 0x3a
-#CHECK: vesrah %v31, %v31, 4095(%r15)
+# CHECK: vesrah %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x1c 0x3a

-#CHECK: vesrlv %v0, %v0, %v0, 11
-0xe7 0x00 0x00 0x00 0xb0 0x78
+# CHECK: vesrav %v0, %v0, %v0, 11
+0xe7 0x00 0x00 0x00 0xb0 0x7a

-#CHECK: vesrlv %v18, %v3, %v20, 11
-0xe7 0x23 0x40 0x00 0xba 0x78
+# CHECK: vesrav %v18, %v3, %v20, 11
+0xe7 0x23 0x40 0x00 0xba 0x7a

-#CHECK: vesrlv %v31, %v31, %v31, 11
-0xe7 0xff 0xf0 0x00 0xbe 0x78
+# CHECK: vesrav %v31, %v31, %v31, 11
+0xe7 0xff 0xf0 0x00 0xbe 0x7a

-#CHECK: vesrlvb %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x00 0x78
+# CHECK: vesravb %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x00 0x7a

-#CHECK: vesrlvb %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x0a 0x78
+# CHECK: vesravb %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x0a 0x7a

-#CHECK: vesrlvb %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x0e 0x78
+# CHECK: vesravb %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x0e 0x7a

-#CHECK: vesrlvf %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x20 0x78
+# CHECK: vesravf %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x20 0x7a

-#CHECK: vesrlvf %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x2a 0x78
+# CHECK: vesravf %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x2a 0x7a

-#CHECK: vesrlvf %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x2e 0x78
+# CHECK: vesravf %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x2e 0x7a

-#CHECK: vesrlvg %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x30 0x78
+# CHECK: vesravg %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x30 0x7a

-#CHECK: vesrlvg %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x3a 0x78
+# CHECK: vesravg %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x3a 0x7a

-#CHECK: vesrlvg %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x3e 0x78
+# CHECK: vesravg %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x3e 0x7a

-#CHECK: vesrlvh %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x10 0x78
+# CHECK: vesravh %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x10 0x7a

-#CHECK: vesrlvh %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x1a 0x78
+# CHECK: vesravh %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x1a 0x7a

-#CHECK: vesrlvh %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x1e 0x78
+# CHECK: vesravh %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x1e 0x7a

-#CHECK: vesrl %v0, %v0, 0, 11
+# CHECK: vesrl %v0, %v0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x38
-#CHECK: vesrl %v12, %v18, 1110(%r3), 11
+# CHECK: vesrl %v12, %v18, 1110(%r3), 11
0xe7 0xc2 0x34 0x56 0xb4 0x38
-#CHECK: vesrl %v31, %v31, 4095(%r15), 11
+# CHECK: vesrl %v31, %v31, 4095(%r15), 11
0xe7 0xff 0xff 0xff 0xbc 0x38
-#CHECK: vesrlb %v0, %v0, 0
+# CHECK: vesrlb %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x38
-#CHECK: vesrlb %v12, %v18, 1110(%r3)
+# CHECK: vesrlb %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x04 0x38
-#CHECK: vesrlb %v31, %v31, 4095(%r15)
+# CHECK: vesrlb %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x0c 0x38
-#CHECK: vesrlf %v0, %v0, 0
+# CHECK: vesrlf %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x38
-#CHECK: vesrlf %v12, %v18, 1110(%r3)
+# CHECK: vesrlf %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x24 0x38
-#CHECK: vesrlf %v31, %v31, 4095(%r15)
+# CHECK: vesrlf %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x2c 0x38
-#CHECK: vesrlg %v0, %v0, 0
+# CHECK: vesrlg %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x30 0x38
-#CHECK: vesrlg %v12, %v18, 1110(%r3)
+# CHECK: vesrlg %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x34 0x38
-#CHECK: vesrlg %v31, %v31, 4095(%r15)
+# CHECK: vesrlg %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x3c 0x38
-#CHECK: vesrlh %v0, %v0, 0
+# CHECK: vesrlh %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x38
-#CHECK: vesrlh %v12, %v18, 1110(%r3)
+# CHECK: vesrlh %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x14 0x38
-#CHECK: vesrlh %v31, %v31, 4095(%r15)
+# CHECK: vesrlh %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x1c 0x38
-#CHECK: vfa %v0, %v0, %v0, 11, 9
+# CHECK: vesrlv %v0, %v0, %v0, 11
+0xe7 0x00 0x00 0x00 0xb0 0x78
+
+# CHECK: vesrlv %v18, %v3, %v20, 11
+0xe7 0x23 0x40 0x00 0xba 0x78
+
+# CHECK: vesrlv %v31, %v31, %v31, 11
+0xe7 0xff 0xf0 0x00 0xbe 0x78
+
+# CHECK: vesrlvb %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x00 0x78
+
+# CHECK: vesrlvb %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x0a 0x78
+
+# CHECK: vesrlvb %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x0e 0x78
+
+# CHECK: vesrlvf %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x20 0x78
+
+# CHECK: vesrlvf %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x2a 0x78
+
+# CHECK: vesrlvf %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x2e 0x78
+
+# CHECK: vesrlvg %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x30 0x78
+
+# CHECK: vesrlvg %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x3a 0x78
+
+# CHECK: vesrlvg %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x3e 0x78
+
+# CHECK: vesrlvh %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x10 0x78
+
+# CHECK: vesrlvh %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x1a 0x78
+
+# CHECK: vesrlvh %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x1e 0x78
+
+# CHECK: vfa %v0, %v0, %v0, 11, 9
0xe7 0x00 0x00 0x09 0xb0 0xe3
-#CHECK: vfa %v18, %v3, %v20, 11, 9
+# CHECK: vfa %v18, %v3, %v20, 11, 9
0xe7 0x23 0x40 0x09 0xba 0xe3
-#CHECK: vfa %v31, %v31, %v31, 11, 9
+# CHECK: vfa %v31, %v31, %v31, 11, 9
0xe7 0xff 0xf0 0x09 0xbe 0xe3
-#CHECK: vfadb %v0, %v0, %v0
+# CHECK: vfadb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xe3
-#CHECK: vfadb %v18, %v3, %v20
+# CHECK: vfadb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xe3
-#CHECK: vfadb %v31, %v31, %v31
+# CHECK: vfadb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xe3
-#CHECK: vfae %v0, %v0, %v0, 11, 0
+# CHECK: vfae %v0, %v0, %v0, 11, 0
0xe7 0x00 0x00 0x00 0xb0 0x82
-#CHECK: vfae %v0, %v0, %v0, 11, 12
+# CHECK: vfae %v0, %v0, %v0, 11, 12
0xe7 0x00 0x00 0xc0 0xb0 0x82
-#CHECK: vfae %v18, %v3, %v20, 11, 0
+# CHECK: vfae %v18, %v3, %v20, 11, 0
0xe7 0x23 0x40 0x00 0xba 0x82
-#CHECK: vfae %v31, %v31, %v31, 11, 4
+# CHECK: vfae %v31, %v31, %v31, 11, 4
0xe7 0xff 0xf0 0x40 0xbe 0x82
-#CHECK: vfaeb %v0, %v0, %v0, 0
+# CHECK: vfaeb %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x82
-#CHECK: vfaeb %v0, %v0, %v0, 12
+# CHECK: vfaeb %v0, %v0, %v0, 12
0xe7 0x00 0x00 0xc0 0x00 0x82
-#CHECK: vfaeb %v18, %v3, %v20, 0
+# CHECK: vfaeb %v18, %v3, %v20, 0
0xe7 0x23 0x40 0x00 0x0a 0x82
-#CHECK: vfaeb %v31, %v31, %v31, 4
+# CHECK: vfaeb %v31, %v31, %v31, 4
0xe7 0xff 0xf0 0x40 0x0e 0x82
-#CHECK: vfaebs %v31, %v31, %v31, 8
+# CHECK: vfaebs %v31, %v31, %v31, 8
0xe7 0xff 0xf0 0x90 0x0e 0x82
-#CHECK: vfaezb %v31, %v31, %v31, 4
+# CHECK: vfaezb %v31, %v31, %v31, 4
0xe7 0xff 0xf0 0x60 0x0e 0x82
-#CHECK: vfaezbs %v31, %v31, %v31, 8
+# CHECK: vfaezbs %v31, %v31, %v31, 8
0xe7 0xff 0xf0 0xb0 0x0e 0x82
-#CHECK: vfaef %v0, %v0, %v0, 0
+# CHECK: vfaef %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x82
-#CHECK: vfaef %v0, %v0, %v0, 12
+# CHECK: vfaef %v0, %v0, %v0, 12
0xe7 0x00 0x00 0xc0 0x20 0x82
-#CHECK: vfaef %v18, %v3, %v20, 0
+# CHECK: vfaef %v18, %v3, %v20, 0
0xe7 0x23 0x40 0x00 0x2a 0x82
-#CHECK: vfaef %v31, %v31, %v31, 4
+# CHECK: vfaef %v31, %v31, %v31, 4
0xe7 0xff 0xf0 0x40 0x2e 0x82
-#CHECK: vfaefs %v31, %v31, %v31, 8
+# CHECK: vfaefs %v31, %v31, %v31, 8
0xe7 0xff 0xf0 0x90 0x2e 0x82
-#CHECK: vfaezf %v31, %v31, %v31, 4
+# CHECK: vfaezf %v31, %v31, %v31, 4
0xe7 0xff 0xf0 0x60 0x2e 0x82
-#CHECK: vfaezfs %v31, %v31, %v31, 8
+# CHECK: vfaezfs %v31, %v31, %v31, 8
0xe7 0xff 0xf0 0xb0 0x2e 0x82
-#CHECK: vfaeh %v0, %v0, %v0, 0
+# CHECK: vfaeh %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x82
-#CHECK: vfaeh %v0, %v0, %v0, 12
+# CHECK: vfaeh %v0, %v0, %v0, 12
0xe7 0x00 0x00 0xc0 0x10 0x82
-#CHECK: vfaeh %v18, %v3, %v20, 0
+# CHECK: vfaeh %v18, %v3, %v20, 0
0xe7 0x23 0x40 0x00 0x1a 0x82
-#CHECK: vfaeh %v31, %v31, %v31, 4
+# CHECK: vfaeh %v31, %v31, %v31, 4
0xe7 0xff 0xf0 0x40 0x1e 0x82
-#CHECK: vfaehs %v31, %v31, %v31, 8
+# CHECK: vfaehs %v31, %v31, %v31, 8
0xe7 0xff 0xf0 0x90 0x1e 0x82
-#CHECK: vfaezh %v31, %v31, %v31, 4
+# CHECK: vfaezh %v31, %v31, %v31, 4
0xe7 0xff 0xf0 0x60 0x1e 0x82
-#CHECK: vfaezhs %v31, %v31, %v31, 8
+# CHECK: vfaezhs %v31, %v31, %v31, 8
0xe7 0xff 0xf0 0xb0 0x1e 0x82
-#CHECK: vfce %v0, %v0, %v0, 11, 9
+# CHECK: vfce %v0, %v0, %v0, 11, 9
0xe7 0x00 0x00 0x09 0xb0 0xe8
-#CHECK: vfce %v18, %v3, %v20, 11, 9
+# CHECK: vfce %v18, %v3, %v20, 11, 9
0xe7 0x23 0x40 0x09 0xba 0xe8
-#CHECK: vfce %v31, %v31, %v31, 11, 9
+# CHECK: vfce %v31, %v31, %v31, 11, 9
0xe7 0xff 0xf0 0x09 0xbe 0xe8
-#CHECK: vfcedb %v0, %v0, %v0
+# CHECK: vfcedb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xe8
-#CHECK: vfcedb %v18, %v3, %v20
+# CHECK: vfcedb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xe8
-#CHECK: vfcedb %v31, %v31, %v31
+# CHECK: vfcedb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xe8
-#CHECK: vfcedbs %v0, %v0, %v0
+# CHECK: vfcedbs %v0, %v0, %v0
0xe7 0x00 0x00 0x10 0x30 0xe8
-#CHECK: vfcedbs %v18, %v3, %v20
+# CHECK: vfcedbs %v18, %v3, %v20
0xe7 0x23 0x40 0x10 0x3a 0xe8
-#CHECK: vfcedbs %v31, %v31, %v31
+# CHECK: vfcedbs %v31, %v31, %v31
0xe7 0xff 0xf0 0x10 0x3e 0xe8
-#CHECK: vfch %v0, %v0, %v0, 11, 9
+# CHECK: vfch %v0, %v0, %v0, 11, 9
0xe7 0x00 0x00 0x09 0xb0 0xeb
-#CHECK: vfch %v18, %v3, %v20, 11, 9
+# CHECK: vfch %v18, %v3, %v20, 11, 9
0xe7 0x23 0x40 0x09 0xba 0xeb
-#CHECK: vfch %v31, %v31, %v31, 11, 9
+# CHECK: vfch %v31, %v31, %v31, 11, 9
0xe7 0xff 0xf0 0x09 0xbe 0xeb
-#CHECK: vfchdb %v0, %v0, %v0
+# CHECK: vfchdb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xeb
-#CHECK: vfchdb %v18, %v3, %v20
+# CHECK: vfchdb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xeb
-#CHECK: vfchdb %v31, %v31, %v31
+# CHECK: vfchdb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xeb
-#CHECK: vfchdbs %v0, %v0, %v0
+# CHECK: vfchdbs %v0, %v0, %v0
0xe7 0x00 0x00 0x10 0x30 0xeb
-#CHECK: vfchdbs %v18, %v3, %v20
+# CHECK: vfchdbs %v18, %v3, %v20
0xe7 0x23 0x40 0x10 0x3a 0xeb
-#CHECK: vfchdbs %v31, %v31, %v31
+# CHECK: vfchdbs %v31, %v31, %v31
0xe7 0xff 0xf0 0x10 0x3e 0xeb
-#CHECK: vfche %v0, %v0, %v0, 11, 9
+# CHECK: vfche %v0, %v0, %v0, 11, 9
0xe7 0x00 0x00 0x09 0xb0 0xea
-#CHECK: vfche %v18, %v3, %v20, 11, 9
+# CHECK: vfche %v18, %v3, %v20, 11, 9
0xe7 0x23 0x40 0x09 0xba 0xea
-#CHECK: vfche %v31, %v31, %v31, 11, 9
+# CHECK: vfche %v31, %v31, %v31, 11, 9
0xe7 0xff 0xf0 0x09 0xbe 0xea
-#CHECK: vfchedb %v0, %v0, %v0
+# CHECK: vfchedb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xea
-#CHECK: vfchedb %v18, %v3, %v20
+# CHECK: vfchedb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xea
-#CHECK: vfchedb %v31, %v31, %v31
+# CHECK: vfchedb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xea
-#CHECK: vfchedbs %v0, %v0, %v0
+# CHECK: vfchedbs %v0, %v0, %v0
0xe7 0x00 0x00 0x10 0x30 0xea
-#CHECK: vfchedbs %v18, %v3, %v20
+# CHECK: vfchedbs %v18, %v3, %v20
0xe7 0x23 0x40 0x10 0x3a 0xea
-#CHECK: vfchedbs %v31, %v31, %v31
+# CHECK: vfchedbs %v31, %v31, %v31
0xe7 0xff 0xf0 0x10 0x3e 0xea
-#CHECK: vfd %v0, %v0, %v0, 11, 9
+# CHECK: vfd %v0, %v0, %v0, 11, 9
0xe7 0x00 0x00 0x09 0xb0 0xe5
-#CHECK: vfd %v18, %v3, %v20, 11, 9
+# CHECK: vfd %v18, %v3, %v20, 11, 9
0xe7 0x23 0x40 0x09 0xba 0xe5
-#CHECK: vfd %v31, %v31, %v31, 11, 9
+# CHECK: vfd %v31, %v31, %v31, 11, 9
0xe7 0xff 0xf0 0x09 0xbe 0xe5
-#CHECK: vfddb %v0, %v0, %v0
+# CHECK: vfddb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xe5
-#CHECK: vfddb %v18, %v3, %v20
+# CHECK: vfddb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xe5
-#CHECK: vfddb %v31, %v31, %v31
+# CHECK: vfddb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xe5
-#CHECK: vfee %v0, %v0, %v0, 11, 0
+# CHECK: vfee %v0, %v0, %v0, 11, 0
0xe7 0x00 0x00 0x00 0xb0 0x80
-#CHECK: vfee %v0, %v0, %v0, 11, 12
+# CHECK: vfee %v0, %v0, %v0, 11, 12
0xe7 0x00 0x00 0xc0 0xb0 0x80
-#CHECK: vfee %v18, %v3, %v20, 11, 0
+# CHECK: vfee %v18, %v3, %v20, 11, 0
0xe7 0x23 0x40 0x00 0xba 0x80
-#CHECK: vfee %v31, %v31, %v31, 11, 0
+# CHECK: vfee %v31, %v31, %v31, 11, 0
0xe7 0xff 0xf0 0x00 0xbe 0x80
-#CHECK: vfeeb %v0, %v0, %v0, 0
+# CHECK: vfeeb %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x80
-#CHECK: vfeeb %v0, %v0, %v0, 12
+# CHECK: vfeeb %v0, %v0, %v0, 12
0xe7 0x00 0x00 0xc0 0x00 0x80
-#CHECK: vfeeb %v18, %v3, %v20, 0
+# CHECK: vfeeb %v18, %v3, %v20, 0
0xe7 0x23 0x40 0x00 0x0a 0x80
-#CHECK: vfeebs %v7, %v24, %v9
+# CHECK: vfeeb %v31, %v31, %v31, 0
+0xe7 0xff 0xf0 0x00 0x0e 0x80
+
+# CHECK: vfeebs %v7, %v24, %v9
0xe7 0x78 0x90 0x10 0x04 0x80
-#CHECK: vfeezb %v18, %v3, %v20
+# CHECK: vfeezb %v18, %v3, %v20
0xe7 0x23 0x40 0x20 0x0a 0x80
-#CHECK: vfeezbs %v7, %v24, %v9
+# CHECK: vfeezbs %v7, %v24, %v9
0xe7 0x78 0x90 0x30 0x04 0x80
-#CHECK: vfeeb %v31, %v31, %v31, 0
-0xe7 0xff 0xf0 0x00 0x0e 0x80
-
-#CHECK: vfeef %v0, %v0, %v0, 0
+# CHECK: vfeef %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x80
-#CHECK: vfeef %v0, %v0, %v0, 12
+# CHECK: vfeef %v0, %v0, %v0, 12
0xe7 0x00 0x00 0xc0 0x20 0x80
-#CHECK: vfeef %v18, %v3, %v20, 0
+# CHECK: vfeef %v18, %v3, %v20, 0
0xe7 0x23 0x40 0x00 0x2a 0x80
-#CHECK: vfeefs %v7, %v24, %v9
+# CHECK: vfeef %v31, %v31, %v31, 0
+0xe7 0xff 0xf0 0x00 0x2e 0x80
+
+# CHECK: vfeefs %v7, %v24, %v9
0xe7 0x78 0x90 0x10 0x24 0x80
-#CHECK: vfeezf %v18, %v3, %v20
+# CHECK: vfeezf %v18, %v3, %v20
0xe7 0x23 0x40 0x20 0x2a 0x80
-#CHECK: vfeezfs %v7, %v24, %v9
+# CHECK: vfeezfs %v7, %v24, %v9
0xe7 0x78 0x90 0x30 0x24 0x80
-#CHECK: vfeef %v31, %v31, %v31, 0
-0xe7 0xff 0xf0 0x00 0x2e 0x80
-
-#CHECK: vfeeh %v0, %v0, %v0, 0
+# CHECK: vfeeh %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x80
-#CHECK: vfeeh %v0, %v0, %v0, 12
+# CHECK: vfeeh %v0, %v0, %v0, 12
0xe7 0x00 0x00 0xc0 0x10 0x80
-#CHECK: vfeeh %v18, %v3, %v20, 0
+# CHECK: vfeeh %v18, %v3, %v20, 0
0xe7 0x23 0x40 0x00 0x1a 0x80
-#CHECK: vfeehs %v7, %v24, %v9
+# CHECK: vfeeh %v31, %v31, %v31, 0
+0xe7 0xff 0xf0 0x00 0x1e 0x80
+
+# CHECK: vfeehs %v7, %v24, %v9
0xe7 0x78 0x90 0x10 0x14 0x80
-#CHECK: vfeezh %v18, %v3, %v20
+# CHECK: vfeezh %v18, %v3, %v20
0xe7 0x23 0x40 0x20 0x1a 0x80
-#CHECK: vfeezhs %v7, %v24, %v9
+# CHECK: vfeezhs %v7, %v24, %v9
0xe7 0x78 0x90 0x30 0x14 0x80
-#CHECK: vfeeh %v31, %v31, %v31, 0
-0xe7 0xff 0xf0 0x00 0x1e 0x80
-
-#CHECK: vfene %v0, %v0, %v0, 11, 0
+# CHECK: vfene %v0, %v0, %v0, 11, 0
0xe7 0x00 0x00 0x00 0xb0 0x81
-#CHECK: vfene %v0, %v0, %v0, 11, 12
+# CHECK: vfene %v0, %v0, %v0, 11, 12
0xe7 0x00 0x00 0xc0 0xb0 0x81
-#CHECK: vfene %v18, %v3, %v20, 11, 0
+# CHECK: vfene %v18, %v3, %v20, 11, 0
0xe7 0x23 0x40 0x00 0xba 0x81
-#CHECK: vfene %v31, %v31, %v31, 11, 0
+# CHECK: vfene %v31, %v31, %v31, 11, 0
0xe7 0xff 0xf0 0x00 0xbe 0x81
-#CHECK: vfeneb %v0, %v0, %v0, 0
+# CHECK: vfeneb %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x81
-#CHECK: vfeneb %v0, %v0, %v0, 12
+# CHECK: vfeneb %v0, %v0, %v0, 12
0xe7 0x00 0x00 0xc0 0x00 0x81
-#CHECK: vfeneb %v18, %v3, %v20, 0
+# CHECK: vfeneb %v18, %v3, %v20, 0
0xe7 0x23 0x40 0x00 0x0a 0x81
-#CHECK: vfenebs %v7, %v24, %v9
+# CHECK: vfenebs %v7, %v24, %v9
0xe7 0x78 0x90 0x10 0x04 0x81
-#CHECK: vfenezb %v18, %v3, %v20
+# CHECK: vfeneb %v31, %v31, %v31, 0
+0xe7 0xff 0xf0 0x00 0x0e 0x81
+
+# CHECK: vfenezb %v18, %v3, %v20
0xe7 0x23 0x40 0x20 0x0a 0x81
-#CHECK: vfenezbs %v7, %v24, %v9
+# CHECK: vfenezbs %v7, %v24, %v9
0xe7 0x78 0x90 0x30 0x04 0x81
-#CHECK: vfeneb %v31, %v31, %v31, 0
-0xe7 0xff 0xf0 0x00 0x0e 0x81
-
-#CHECK: vfenef %v0, %v0, %v0, 0
+# CHECK: vfenef %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x81
-#CHECK: vfenef %v0, %v0, %v0, 12
+# CHECK: vfenef %v0, %v0, %v0, 12
0xe7 0x00 0x00 0xc0 0x20 0x81
-#CHECK: vfenef %v18, %v3, %v20, 0
+# CHECK: vfenef %v18, %v3, %v20, 0
0xe7 0x23 0x40 0x00 0x2a 0x81
-#CHECK: vfenefs %v7, %v24, %v9
+# CHECK: vfenef %v31, %v31, %v31, 0
+0xe7 0xff 0xf0 0x00 0x2e 0x81
+
+# CHECK: vfenefs %v7, %v24, %v9
0xe7 0x78 0x90 0x10 0x24 0x81
-#CHECK: vfenezf %v18, %v3, %v20
+# CHECK: vfenezf %v18, %v3, %v20
0xe7 0x23 0x40 0x20 0x2a 0x81
-#CHECK: vfenezfs %v7, %v24, %v9
+# CHECK: vfenezfs %v7, %v24, %v9
0xe7 0x78 0x90 0x30 0x24 0x81
-#CHECK: vfenef %v31, %v31, %v31, 0
-0xe7 0xff 0xf0 0x00 0x2e 0x81
-
-#CHECK: vfeneh %v0, %v0, %v0, 0
+# CHECK: vfeneh %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x81
-#CHECK: vfeneh %v0, %v0, %v0, 12
+# CHECK: vfeneh %v0, %v0, %v0, 12
0xe7 0x00 0x00 0xc0 0x10 0x81
-#CHECK: vfeneh %v18, %v3, %v20, 0
+# CHECK: vfeneh %v18, %v3, %v20, 0
0xe7 0x23 0x40 0x00 0x1a 0x81
-#CHECK: vfenehs %v7, %v24, %v9
+# CHECK: vfeneh %v31, %v31, %v31, 0
+0xe7 0xff 0xf0 0x00 0x1e 0x81
+
+# CHECK: vfenehs %v7, %v24, %v9
0xe7 0x78 0x90 0x10 0x14 0x81
-#CHECK: vfenezh %v18, %v3, %v20
+# CHECK: vfenezh %v18, %v3, %v20
0xe7 0x23 0x40 0x20 0x1a 0x81
-#CHECK: vfenezhs %v7, %v24, %v9
+# CHECK: vfenezhs %v7, %v24, %v9
0xe7 0x78 0x90 0x30 0x14 0x81
-#CHECK: vfeneh %v31, %v31, %v31, 0
-0xe7 0xff 0xf0 0x00 0x1e 0x81
-
-#CHECK: vfi %v0, %v0, 11, 0, 0
+# CHECK: vfi %v0, %v0, 11, 0, 0
0xe7 0x00 0x00 0x00 0xb0 0xc7
-#CHECK: vfi %v19, %v14, 11, 4, 10
+# CHECK: vfi %v19, %v14, 11, 4, 10
0xe7 0x3e 0x00 0xa4 0xb8 0xc7
-#CHECK: vfi %v31, %v31, 11, 7, 15
+# CHECK: vfi %v31, %v31, 11, 7, 15
0xe7 0xff 0x00 0xf7 0xbc 0xc7
-#CHECK: vfidb %v0, %v0, 0, 0
+# CHECK: vfidb %v0, %v0, 0, 0
0xe7 0x00 0x00 0x00 0x30 0xc7
-#CHECK: vfidb %v19, %v14, 4, 10
+# CHECK: vfidb %v19, %v14, 4, 10
0xe7 0x3e 0x00 0xa4 0x38 0xc7
-#CHECK: vfidb %v31, %v31, 7, 15
+# CHECK: vfidb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xf7 0x3c 0xc7
-#CHECK: vistr %v0, %v0, 11, 0
-0xe7 0x00 0x00 0x00 0xb0 0x5c
-
-#CHECK: vistr %v0, %v0, 11, 12
-0xe7 0x00 0x00 0xc0 0xb0 0x5c
-
-#CHECK: vistr %v18, %v3, 11, 0
-0xe7 0x23 0x00 0x00 0xb8 0x5c
-
-#CHECK: vistr %v31, %v31, 11, 0
-0xe7 0xff 0x00 0x00 0xbc 0x5c
-
-#CHECK: vistrb %v0, %v0, 0
-0xe7 0x00 0x00 0x00 0x00 0x5c
-
-#CHECK: vistrb %v0, %v0, 12
-0xe7 0x00 0x00 0xc0 0x00 0x5c
-
-#CHECK: vistrb %v18, %v3, 0
-0xe7 0x23 0x00 0x00 0x08 0x5c
-
-#CHECK: vistrbs %v7, %v24
-0xe7 0x78 0x00 0x10 0x04 0x5c
+# CHECK: vflcdb %v0, %v0
+0xe7 0x00 0x00 0x00 0x30 0xcc

-#CHECK: vistrb %v31, %v31, 0
-0xe7 0xff 0x00 0x00 0x0c 0x5c
+# CHECK: vflcdb %v19, %v14
+0xe7 0x3e 0x00 0x00 0x38 0xcc

-#CHECK: vistrf %v0, %v0, 0
-0xe7 0x00 0x00 0x00 0x20 0x5c
+# CHECK: vflcdb %v31, %v31
+0xe7 0xff 0x00 0x00 0x3c 0xcc

-#CHECK: vistrf %v0, %v0, 12
-0xe7 0x00 0x00 0xc0 0x20 0x5c
+# CHECK: vflndb %v0, %v0
+0xe7 0x00 0x00 0x10 0x30 0xcc

-#CHECK: vistrf %v18, %v3, 0
-0xe7 0x23 0x00 0x00 0x28 0x5c
+# CHECK: vflndb %v19, %v14
+0xe7 0x3e 0x00 0x10 0x38 0xcc

-#CHECK: vistrfs %v7, %v24
-0xe7 0x78 0x00 0x10 0x24 0x5c
+# CHECK: vflndb %v31, %v31
+0xe7 0xff 0x00 0x10 0x3c 0xcc

-#CHECK: vistrf %v31, %v31, 0
-0xe7 0xff 0x00 0x00 0x2c 0x5c
+# CHECK: vflpdb %v0, %v0
+0xe7 0x00 0x00 0x20 0x30 0xcc

-#CHECK: vistrh %v0, %v0, 0
-0xe7 0x00 0x00 0x00 0x10 0x5c
+# CHECK: vflpdb %v19, %v14
+0xe7 0x3e 0x00 0x20 0x38 0xcc

-#CHECK: vistrh %v0, %v0, 12
-0xe7 0x00 0x00 0xc0 0x10 0x5c
+# CHECK: vflpdb %v31, %v31
+0xe7 0xff 0x00 0x20 0x3c 0xcc

-#CHECK: vistrh %v18, %v3, 0
-0xe7 0x23 0x00 0x00 0x18 0x5c
+# CHECK: vfm %v0, %v0, %v0, 11, 9
+0xe7 0x00 0x00 0x09 0xb0 0xe7

-#CHECK: vistrhs %v7, %v24
-0xe7 0x78 0x00 0x10 0x14 0x5c
+# CHECK: vfm %v18, %v3, %v20, 11, 9
+0xe7 0x23 0x40 0x09 0xba 0xe7

-#CHECK: vistrh %v31, %v31, 0
-0xe7 0xff 0x00 0x00 0x1c 0x5c
+# CHECK: vfm %v31, %v31, %v31, 11, 9
+0xe7 0xff 0xf0 0x09 0xbe 0xe7

-#CHECK: vfma %v0, %v0, %v0, %v0, 9, 11
+# CHECK: vfma %v0, %v0, %v0, %v0, 9, 11
0xe7 0x00 0x0b 0x09 0x00 0x8f
-#CHECK: vfma %v3, %v20, %v5, %v22, 9, 11
+# CHECK: vfma %v3, %v20, %v5, %v22, 9, 11
0xe7 0x34 0x5b 0x09 0x65 0x8f
-#CHECK: vfma %v31, %v31, %v31, %v31, 9, 11
+# CHECK: vfma %v31, %v31, %v31, %v31, 9, 11
0xe7 0xff 0xfb 0x09 0xff 0x8f
-#CHECK: vfmadb %v0, %v0, %v0, %v0
+# CHECK: vfmadb %v0, %v0, %v0, %v0
0xe7 0x00 0x03 0x00 0x00 0x8f
-#CHECK: vfmadb %v3, %v20, %v5, %v22
+# CHECK: vfmadb %v3, %v20, %v5, %v22
0xe7 0x34 0x53 0x00 0x65 0x8f
-#CHECK: vfmadb %v31, %v31, %v31, %v31
+# CHECK: vfmadb %v31, %v31, %v31, %v31
0xe7 0xff 0xf3 0x00 0xff 0x8f
-#CHECK: vfm %v0, %v0, %v0, 11, 9
-0xe7 0x00 0x00 0x09 0xb0 0xe7
-
-#CHECK: vfm %v18, %v3, %v20, 11, 9
-0xe7 0x23 0x40 0x09 0xba 0xe7
-
-#CHECK: vfm %v31, %v31, %v31, 11, 9
-0xe7 0xff 0xf0 0x09 0xbe 0xe7
-
-#CHECK: vfmdb %v0, %v0, %v0
+# CHECK: vfmdb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xe7
-#CHECK: vfmdb %v18, %v3, %v20
+# CHECK: vfmdb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xe7
-#CHECK: vfmdb %v31, %v31, %v31
+# CHECK: vfmdb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xe7
-#CHECK: vfms %v0, %v0, %v0, %v0, 9, 11
+# CHECK: vfms %v0, %v0, %v0, %v0, 9, 11
0xe7 0x00 0x0b 0x09 0x00 0x8e
-#CHECK: vfms %v3, %v20, %v5, %v22, 9, 11
+# CHECK: vfms %v3, %v20, %v5, %v22, 9, 11
0xe7 0x34 0x5b 0x09 0x65 0x8e
-#CHECK: vfms %v31, %v31, %v31, %v31, 9, 11
+# CHECK: vfms %v31, %v31, %v31, %v31, 9, 11
0xe7 0xff 0xfb 0x09 0xff 0x8e
-#CHECK: vfmsdb %v0, %v0, %v0, %v0
+# CHECK: vfmsdb %v0, %v0, %v0, %v0
0xe7 0x00 0x03 0x00 0x00 0x8e
-#CHECK: vfmsdb %v3, %v20, %v5, %v22
+# CHECK: vfmsdb %v3, %v20, %v5, %v22
0xe7 0x34 0x53 0x00 0x65 0x8e
-#CHECK: vfmsdb %v31, %v31, %v31, %v31
+# CHECK: vfmsdb %v31, %v31, %v31, %v31
0xe7 0xff 0xf3 0x00 0xff 0x8e
-#CHECK: vfs %v0, %v0, %v0, 11, 9
+# CHECK: vfpso %v0, %v0, 11, 9, 7
+0xe7 0x00 0x00 0x79 0xb0 0xcc
+
+# CHECK: vfpso %v19, %v14, 11, 9, 7
+0xe7 0x3e 0x00 0x79 0xb8 0xcc
+
+# CHECK: vfpso %v31, %v31, 11, 9, 7
+0xe7 0xff 0x00 0x79 0xbc 0xcc
+
+# CHECK: vfpsodb %v0, %v0, 7
+0xe7 0x00 0x00 0x70 0x30 0xcc
+
+# CHECK: vfpsodb %v19, %v14, 7
+0xe7 0x3e 0x00 0x70 0x38 0xcc
+
+# CHECK: vfpsodb %v31, %v31, 7
+0xe7 0xff 0x00 0x70 0x3c 0xcc
+
+# CHECK: vfs %v0, %v0, %v0, 11, 9
0xe7 0x00 0x00 0x09 0xb0 0xe2
-#CHECK: vfs %v18, %v3, %v20, 11, 9
+# CHECK: vfs %v18, %v3, %v20, 11, 9
0xe7 0x23 0x40 0x09 0xba 0xe2
-#CHECK: vfs %v31, %v31, %v31, 11, 9
+# CHECK: vfs %v31, %v31, %v31, 11, 9
0xe7 0xff 0xf0 0x09 0xbe 0xe2
-#CHECK: vfsdb %v0, %v0, %v0
+# CHECK: vfsdb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xe2
-#CHECK: vfsdb %v18, %v3, %v20
+# CHECK: vfsdb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xe2
-#CHECK: vfsdb %v31, %v31, %v31
+# CHECK: vfsdb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xe2
-#CHECK: vzero %v0
-0xe7 0x00 0x00 0x00 0x00 0x44
+# CHECK: vfsq %v0, %v0, 11, 9
+0xe7 0x00 0x00 0x09 0xb0 0xce
+
+# CHECK: vfsq %v19, %v14, 11, 9
+0xe7 0x3e 0x00 0x09 0xb8 0xce

-#CHECK: vgbm %v0, 1
+# CHECK: vfsq %v31, %v31, 11, 9
+0xe7 0xff 0x00 0x09 0xbc 0xce
+
+# CHECK: vfsqdb %v0, %v0
+0xe7 0x00 0x00 0x00 0x30 0xce
+
+# CHECK: vfsqdb %v19, %v14
+0xe7 0x3e 0x00 0x00 0x38 0xce
+
+# CHECK: vfsqdb %v31, %v31
+0xe7 0xff 0x00 0x00 0x3c 0xce
+
+# CHECK: vftci %v0, %v0, 0, 11, 9
+0xe7 0x00 0x00 0x09 0xb0 0x4a
+
+# CHECK: vftci %v19, %v4, 1383, 11, 9
+0xe7 0x34 0x56 0x79 0xb8 0x4a
+
+# CHECK: vftci %v31, %v31, 4095, 11, 9
+0xe7 0xff 0xff 0xf9 0xbc 0x4a
+
+# CHECK: vftcidb %v0, %v0, 0
+0xe7 0x00 0x00 0x00 0x30 0x4a
+
+# CHECK: vftcidb %v19, %v4, 1383
+0xe7 0x34 0x56 0x70 0x38 0x4a
+
+# CHECK: vftcidb %v31, %v31, 4095
+0xe7 0xff 0xff 0xf0 0x3c 0x4a
+
+# CHECK: vgbm %v0, 1
0xe7 0x00 0x00 0x01 0x00 0x44
-#CHECK: vgbm %v0, 65534
+# CHECK: vgbm %v0, 65534
0xe7 0x00 0xff 0xfe 0x00 0x44
-#CHECK: vone %v0
-0xe7 0x00 0xff 0xff 0x00 0x44
-
-#CHECK: vgbm %v17, 4660
+# CHECK: vgbm %v17, 4660
0xe7 0x10 0x12 0x34 0x08 0x44
-#CHECK: vone %v31
+# CHECK: vzero %v0
+0xe7 0x00 0x00 0x00 0x00 0x44
+
+# CHECK: vone %v0
+0xe7 0x00 0xff 0xff 0x00 0x44
+
+# CHECK: vone %v31
0xe7 0xf0 0xff 0xff 0x08 0x44
-#CHECK: vgef %v0, 0(%v0), 0
+# CHECK: vgef %v0, 0(%v0), 0
0xe7 0x00 0x00 0x00 0x00 0x13
-#CHECK: vgef %v10, 1000(%v19,%r7), 2
+# CHECK: vgef %v10, 1000(%v19,%r7), 2
0xe7 0xa3 0x73 0xe8 0x24 0x13
-#CHECK: vgef %v31, 4095(%v31,%r15), 3
+# CHECK: vgef %v31, 4095(%v31,%r15), 3
0xe7 0xff 0xff 0xff 0x3c 0x13
-#CHECK: vgeg %v0, 0(%v0), 0
+# CHECK: vgeg %v0, 0(%v0), 0
0xe7 0x00 0x00 0x00 0x00 0x12
-#CHECK: vgeg %v10, 1000(%v19,%r7), 1
+# CHECK: vgeg %v10, 1000(%v19,%r7), 1
0xe7 0xa3 0x73 0xe8 0x14 0x12
-#CHECK: vgeg %v31, 4095(%v31,%r15), 1
+# CHECK: vgeg %v31, 4095(%v31,%r15), 1
0xe7 0xff 0xff 0xff 0x1c 0x12
-#CHECK: vgfma %v0, %v0, %v0, %v0, 11
+# CHECK: vgfm %v0, %v0, %v0, 11
+0xe7 0x00 0x00 0x00 0xb0 0xb4
+
+# CHECK: vgfm %v18, %v3, %v20, 11
+0xe7 0x23 0x40 0x00 0xba 0xb4
+
+# CHECK: vgfm %v31, %v31, %v31, 11
+0xe7 0xff 0xf0 0x00 0xbe 0xb4
+
+# CHECK: vgfma %v0, %v0, %v0, %v0, 11
0xe7 0x00 0x0b 0x00 0x00 0xbc
-#CHECK: vgfma %v3, %v20, %v5, %v22, 11
+# CHECK: vgfma %v3, %v20, %v5, %v22, 11
0xe7 0x34 0x5b 0x00 0x65 0xbc
-#CHECK: vgfma %v31, %v31, %v31, %v31, 11
+# CHECK: vgfma %v31, %v31, %v31, %v31, 11
0xe7 0xff 0xfb 0x00 0xff 0xbc
-#CHECK: vgfmab %v0, %v0, %v0, %v0
+# CHECK: vgfmab %v0, %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xbc
-#CHECK: vgfmab %v3, %v20, %v5, %v22
+# CHECK: vgfmab %v3, %v20, %v5, %v22
0xe7 0x34 0x50 0x00 0x65 0xbc
-#CHECK: vgfmab %v31, %v31, %v31, %v31
+# CHECK: vgfmab %v31, %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0xff 0xbc
-#CHECK: vgfmaf %v0, %v0, %v0, %v0
+# CHECK: vgfmaf %v0, %v0, %v0, %v0
0xe7 0x00 0x02 0x00 0x00 0xbc
-#CHECK: vgfmaf %v3, %v20, %v5, %v22
+# CHECK: vgfmaf %v3, %v20, %v5, %v22
0xe7 0x34 0x52 0x00 0x65 0xbc
-#CHECK: vgfmaf %v31, %v31, %v31, %v31
+# CHECK: vgfmaf %v31, %v31, %v31, %v31
0xe7 0xff 0xf2 0x00 0xff 0xbc
-#CHECK: vgfmag %v0, %v0, %v0, %v0
+# CHECK: vgfmag %v0, %v0, %v0, %v0
0xe7 0x00 0x03 0x00 0x00 0xbc
-#CHECK: vgfmag %v3, %v20, %v5, %v22
+# CHECK: vgfmag %v3, %v20, %v5, %v22
0xe7 0x34 0x53 0x00 0x65 0xbc
-#CHECK: vgfmag %v31, %v31, %v31, %v31
+# CHECK: vgfmag %v31, %v31, %v31, %v31
0xe7 0xff 0xf3 0x00 0xff 0xbc
-#CHECK: vgfmah %v0, %v0, %v0, %v0
+# CHECK: vgfmah %v0, %v0, %v0, %v0
0xe7 0x00 0x01 0x00 0x00 0xbc
-#CHECK: vgfmah %v3, %v20, %v5, %v22
+# CHECK: vgfmah %v3, %v20, %v5, %v22
0xe7 0x34 0x51 0x00 0x65 0xbc
-#CHECK: vgfmah %v31, %v31, %v31, %v31
+# CHECK: vgfmah %v31, %v31, %v31, %v31
0xe7 0xff 0xf1 0x00 0xff 0xbc
-#CHECK: vgfm %v0, %v0, %v0, 11
-0xe7 0x00 0x00 0x00 0xb0 0xb4
-
-#CHECK: vgfm %v18, %v3, %v20, 11
-0xe7 0x23 0x40 0x00 0xba 0xb4
-
-#CHECK: vgfm %v31, %v31, %v31, 11
-0xe7 0xff 0xf0 0x00 0xbe 0xb4
-
-#CHECK: vgfmb %v0, %v0, %v0
+# CHECK: vgfmb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xb4
-#CHECK: vgfmb %v18, %v3, %v20
+# CHECK: vgfmb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xb4
-#CHECK: vgfmb %v31, %v31, %v31
+# CHECK: vgfmb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xb4
-#CHECK: vgfmf %v0, %v0, %v0
+# CHECK: vgfmf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xb4
-#CHECK: vgfmf %v18, %v3, %v20
+# CHECK: vgfmf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xb4
-#CHECK: vgfmf %v31, %v31, %v31
+# CHECK: vgfmf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xb4
-#CHECK: vgfmg %v0, %v0, %v0
+# CHECK: vgfmg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xb4
-#CHECK: vgfmg %v18, %v3, %v20
+# CHECK: vgfmg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xb4
-#CHECK: vgfmg %v31, %v31, %v31
+# CHECK: vgfmg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xb4
-#CHECK: vgfmh %v0, %v0, %v0
+# CHECK: vgfmh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xb4
-#CHECK: vgfmh %v18, %v3, %v20
+# CHECK: vgfmh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xb4
-#CHECK: vgfmh %v31, %v31, %v31
+# CHECK: vgfmh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xb4
-#CHECK: vgm %v0, 0, 0, 11
+# CHECK: vgm %v0, 0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x46
-#CHECK: vgm %v22, 55, 66, 11
+# CHECK: vgm %v22, 55, 66, 11
0xe7 0x60 0x37 0x42 0xb8 0x46
-#CHECK: vgm %v31, 255, 255, 11
+# CHECK: vgm %v31, 255, 255, 11
0xe7 0xf0 0xff 0xff 0xb8 0x46
-#CHECK: vgmb %v0, 0, 0
+# CHECK: vgmb %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x46
-#CHECK: vgmb %v22, 55, 66
+# CHECK: vgmb %v22, 55, 66
0xe7 0x60 0x37 0x42 0x08 0x46
-#CHECK: vgmb %v31, 255, 255
+# CHECK: vgmb %v31, 255, 255
0xe7 0xf0 0xff 0xff 0x08 0x46
-#CHECK: vgmf %v0, 0, 0
+# CHECK: vgmf %v0, 0, 0
0xe7 0x00 0x00 0x00 0x20 0x46
-#CHECK: vgmf %v22, 55, 66
+# CHECK: vgmf %v22, 55, 66
0xe7 0x60 0x37 0x42 0x28 0x46
-#CHECK: vgmf %v31, 255, 255
+# CHECK: vgmf %v31, 255, 255
0xe7 0xf0 0xff 0xff 0x28 0x46
-#CHECK: vgmg %v0, 0, 0
+# CHECK: vgmg %v0, 0, 0
0xe7 0x00 0x00 0x00 0x30 0x46
-#CHECK: vgmg %v22, 55, 66
+# CHECK: vgmg %v22, 55, 66
0xe7 0x60 0x37 0x42 0x38 0x46
-#CHECK: vgmg %v31, 255, 255
+# CHECK: vgmg %v31, 255, 255
0xe7 0xf0 0xff 0xff 0x38 0x46
-#CHECK: vgmh %v0, 0, 0
+# CHECK: vgmh %v0, 0, 0
0xe7 0x00 0x00 0x00 0x10 0x46
-#CHECK: vgmh %v22, 55, 66
+# CHECK: vgmh %v22, 55, 66
0xe7 0x60 0x37 0x42 0x18 0x46
-#CHECK: vgmh %v31, 255, 255
+# CHECK: vgmh %v31, 255, 255
0xe7 0xf0 0xff 0xff 0x18 0x46
-#CHECK: vl %v0, 0
+# CHECK: vistr %v0, %v0, 11, 0
+0xe7 0x00 0x00 0x00 0xb0 0x5c
+
+# CHECK: vistr %v0, %v0, 11, 12
+0xe7 0x00 0x00 0xc0 0xb0 0x5c
+
+# CHECK: vistr %v18, %v3, 11, 0
+0xe7 0x23 0x00 0x00 0xb8 0x5c
+
+# CHECK: vistr %v31, %v31, 11, 0
+0xe7 0xff 0x00 0x00 0xbc 0x5c
+
+# CHECK: vistrb %v0, %v0, 0
+0xe7 0x00 0x00 0x00 0x00 0x5c
+
+# CHECK: vistrb %v0, %v0, 12
+0xe7 0x00 0x00 0xc0 0x00 0x5c
+
+# CHECK: vistrb %v18, %v3, 0
+0xe7 0x23 0x00 0x00 0x08 0x5c
+
+# CHECK: vistrb %v31, %v31, 0
+0xe7 0xff 0x00 0x00 0x0c 0x5c
+
+# CHECK: vistrbs %v7, %v24
+0xe7 0x78 0x00 0x10 0x04 0x5c
+
+# CHECK: vistrf %v0, %v0, 0
+0xe7 0x00 0x00 0x00 0x20 0x5c
+
+# CHECK: vistrf %v0, %v0, 12
+0xe7 0x00 0x00 0xc0 0x20 0x5c
+
+# CHECK: vistrf %v18, %v3, 0
+0xe7 0x23 0x00 0x00 0x28 0x5c
+
+# CHECK: vistrf %v31, %v31, 0
+0xe7 0xff 0x00 0x00 0x2c 0x5c
+
+# CHECK: vistrfs %v7, %v24
+0xe7 0x78 0x00 0x10 0x24 0x5c
+
+# CHECK: vistrh %v0, %v0, 0
+0xe7 0x00 0x00 0x00 0x10 0x5c
+
+# CHECK: vistrh %v0, %v0, 12
+0xe7 0x00 0x00 0xc0 0x10 0x5c
+
+# CHECK: vistrh %v18, %v3, 0
+0xe7 0x23 0x00 0x00 0x18 0x5c
+
+# CHECK: vistrh %v31, %v31, 0
+0xe7 0xff 0x00 0x00 0x1c 0x5c
+
+# CHECK: vistrhs %v7, %v24
+0xe7 0x78 0x00 0x10 0x14 0x5c
+
+# CHECK: vl %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x06
-#CHECK: vl %v17, 2475(%r7,%r8)
+# CHECK: vl %v17, 2475(%r7,%r8)
0xe7 0x17 0x89 0xab 0x08 0x06
-#CHECK: vl %v31, 4095(%r15,%r15)
+# CHECK: vl %v31, 4095(%r15,%r15)
0xe7 0xff 0xff 0xff 0x08 0x06
-#CHECK: vlbb %v0, 0, 0
+# CHECK: vlbb %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x07
-#CHECK: vlbb %v17, 2475(%r7,%r8), 12
+# CHECK: vlbb %v17, 2475(%r7,%r8), 12
0xe7 0x17 0x89 0xab 0xc8 0x07
-#CHECK: vlbb %v31, 4095(%r15,%r15), 15
+# CHECK: vlbb %v31, 4095(%r15,%r15), 15
0xe7 0xff 0xff 0xff 0xf8 0x07
-#CHECK: vlc %v0, %v0, 11
+# CHECK: vlc %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xde
-#CHECK: vlc %v19, %v14, 11
+# CHECK: vlc %v19, %v14, 11
0xe7 0x3e 0x00 0x00 0xb8 0xde
-#CHECK: vlc %v31, %v31, 11
+# CHECK: vlc %v31, %v31, 11
0xe7 0xff 0x00 0x00 0xbc 0xde
-#CHECK: vlcb %v0, %v0
+# CHECK: vlcb %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xde
-#CHECK: vlcb %v19, %v14
+# CHECK: vlcb %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0xde
-#CHECK: vlcb %v31, %v31
+# CHECK: vlcb %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0xde
-#CHECK: vlcf %v0, %v0
+# CHECK: vlcf %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xde
-#CHECK: vlcf %v19, %v14
+# CHECK: vlcf %v19, %v14
0xe7 0x3e 0x00 0x00 0x28 0xde
-#CHECK: vlcf %v31, %v31
+# CHECK: vlcf %v31, %v31
0xe7 0xff 0x00 0x00 0x2c 0xde
-#CHECK: vlcg %v0, %v0
+# CHECK: vlcg %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xde
-#CHECK: vlcg %v19, %v14
+# CHECK: vlcg %v19, %v14
0xe7 0x3e 0x00 0x00 0x38 0xde
-#CHECK: vlcg %v31, %v31
+# CHECK: vlcg %v31, %v31
0xe7 0xff 0x00 0x00 0x3c 0xde
-#CHECK: vlch %v0, %v0
+# CHECK: vlch %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xde
-#CHECK: vlch %v19, %v14
+# CHECK: vlch %v19, %v14
0xe7 0x3e 0x00 0x00 0x18 0xde
-#CHECK: vlch %v31, %v31
+# CHECK: vlch %v31, %v31
0xe7 0xff 0x00 0x00 0x1c 0xde
-#CHECK: vlde %v0, %v0, 11, 9
+# CHECK: vlde %v0, %v0, 11, 9
0xe7 0x00 0x00 0x09 0xb0 0xc4
-#CHECK: vlde %v19, %v14, 11, 9
+# CHECK: vlde %v19, %v14, 11, 9
0xe7 0x3e 0x00 0x09 0xb8 0xc4
-#CHECK: vlde %v31, %v31, 11, 9
+# CHECK: vlde %v31, %v31, 11, 9
0xe7 0xff 0x00 0x09 0xbc 0xc4
-#CHECK: vldeb %v0, %v0
+# CHECK: vldeb %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xc4
-#CHECK: vldeb %v19, %v14
+# CHECK: vldeb %v19, %v14
0xe7 0x3e 0x00 0x00 0x28 0xc4
-#CHECK: vldeb %v31, %v31
+# CHECK: vldeb %v31, %v31
0xe7 0xff 0x00 0x00 0x2c 0xc4
-#CHECK: vleb %v0, 0, 0
+# CHECK: vleb %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x00
-#CHECK: vleb %v17, 2475(%r7,%r8), 12
+# CHECK: vleb %v17, 2475(%r7,%r8), 12
0xe7 0x17 0x89 0xab 0xc8 0x00
-#CHECK: vleb %v31, 4095(%r15,%r15), 15
+# CHECK: vleb %v31, 4095(%r15,%r15), 15
0xe7 0xff 0xff 0xff 0xf8 0x00
-#CHECK: vled %v0, %v0, 11, 0, 0
+# CHECK: vled %v0, %v0, 11, 0, 0
0xe7 0x00 0x00 0x00 0xb0 0xc5
-#CHECK: vled %v19, %v14, 11, 4, 10
+# CHECK: vled %v19, %v14, 11, 4, 10
0xe7 0x3e 0x00 0xa4 0xb8 0xc5
-#CHECK: vled %v31, %v31, 11, 7, 15
+# CHECK: vled %v31, %v31, 11, 7, 15
0xe7 0xff 0x00 0xf7 0xbc 0xc5
-#CHECK: vledb %v0, %v0, 0, 0
+# CHECK: vledb %v0, %v0, 0, 0
0xe7 0x00 0x00 0x00 0x30 0xc5
-#CHECK: vledb %v19, %v14, 4, 10
+# CHECK: vledb %v19, %v14, 4, 10
0xe7 0x3e 0x00 0xa4 0x38 0xc5
-#CHECK: vledb %v31, %v31, 7, 15
+# CHECK: vledb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xf7 0x3c 0xc5
-#CHECK: vlef %v0, 0, 0
+# CHECK: vlef %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x03
-#CHECK: vlef %v17, 2475(%r7,%r8), 2
+# CHECK: vlef %v17, 2475(%r7,%r8), 2
0xe7 0x17 0x89 0xab 0x28 0x03
-#CHECK: vlef %v31, 4095(%r15,%r15), 3
+# CHECK: vlef %v31, 4095(%r15,%r15), 3
0xe7 0xff 0xff 0xff 0x38 0x03
-#CHECK: vleg %v0, 0, 0
+# CHECK: vleg %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x02
-#CHECK: vleg %v17, 2475(%r7,%r8), 1
+# CHECK: vleg %v17, 2475(%r7,%r8), 1
0xe7 0x17 0x89 0xab 0x18 0x02
-#CHECK: vleg %v31, 4095(%r15,%r15), 1
+# CHECK: vleg %v31, 4095(%r15,%r15), 1
0xe7 0xff 0xff 0xff 0x18 0x02
-#CHECK: vleh %v0, 0, 0
+# CHECK: vleh %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x01
-#CHECK: vleh %v17, 2475(%r7,%r8), 5
+# CHECK: vleh %v17, 2475(%r7,%r8), 5
0xe7 0x17 0x89 0xab 0x58 0x01
-#CHECK: vleh %v31, 4095(%r15,%r15), 7
+# CHECK: vleh %v31, 4095(%r15,%r15), 7
0xe7 0xff 0xff 0xff 0x78 0x01
-#CHECK: vleib %v0, 0, 0
+# CHECK: vleib %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x40
-#CHECK: vleib %v23, -30293, 12
+# CHECK: vleib %v23, -30293, 12
0xe7 0x70 0x89 0xab 0xc8 0x40
-#CHECK: vleib %v31, -1, 15
+# CHECK: vleib %v31, -1, 15
0xe7 0xf0 0xff 0xff 0xf8 0x40
-#CHECK: vleif %v0, 0, 0
+# CHECK: vleif %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x43
-#CHECK: vleif %v23, -30293, 2
+# CHECK: vleif %v23, -30293, 2
0xe7 0x70 0x89 0xab 0x28 0x43
-#CHECK: vleif %v31, -1, 3
+# CHECK: vleif %v31, -1, 3
0xe7 0xf0 0xff 0xff 0x38 0x43
-#CHECK: vleig %v0, 0, 0
+# CHECK: vleig %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x42
-#CHECK: vleig %v23, -30293, 1
+# CHECK: vleig %v23, -30293, 1
0xe7 0x70 0x89 0xab 0x18 0x42
-#CHECK: vleig %v31, -1, 1
+# CHECK: vleig %v31, -1, 1
0xe7 0xf0 0xff 0xff 0x18 0x42
-#CHECK: vleih %v0, 0, 0
+# CHECK: vleih %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x41
-#CHECK: vleih %v23, -30293, 5
+# CHECK: vleih %v23, -30293, 5
0xe7 0x70 0x89 0xab 0x58 0x41
-#CHECK: vleih %v31, -1, 7
+# CHECK: vleih %v31, -1, 7
0xe7 0xf0 0xff 0xff 0x78 0x41
-#CHECK: vfpso %v0, %v0, 11, 9, 7
-0xe7 0x00 0x00 0x79 0xb0 0xcc
-
-#CHECK: vfpso %v19, %v14, 11, 9, 7
-0xe7 0x3e 0x00 0x79 0xb8 0xcc
-
-#CHECK: vfpso %v31, %v31, 11, 9, 7
-0xe7 0xff 0x00 0x79 0xbc 0xcc
-
-#CHECK: vfpsodb %v0, %v0, 7
-0xe7 0x00 0x00 0x70 0x30 0xcc
-
-#CHECK: vfpsodb %v19, %v14, 7
-0xe7 0x3e 0x00 0x70 0x38 0xcc
-
-#CHECK: vfpsodb %v31, %v31, 7
-0xe7 0xff 0x00 0x70 0x3c 0xcc
-
-#CHECK: vflcdb %v0, %v0
-0xe7 0x00 0x00 0x00 0x30 0xcc
-
-#CHECK: vflcdb %v19, %v14
-0xe7 0x3e 0x00 0x00 0x38 0xcc
-
-#CHECK: vflcdb %v31, %v31
-0xe7 0xff 0x00 0x00 0x3c 0xcc
-
-#CHECK: vflndb %v0, %v0
-0xe7 0x00 0x00 0x10 0x30 0xcc
-
-#CHECK: vflndb %v19, %v14
-0xe7 0x3e 0x00 0x10 0x38 0xcc
-
-#CHECK: vflndb %v31, %v31
-0xe7 0xff 0x00 0x10 0x3c 0xcc
-
-#CHECK: vflpdb %v0, %v0
-0xe7 0x00 0x00 0x20 0x30 0xcc
-
-#CHECK: vflpdb %v19, %v14
-0xe7 0x3e 0x00 0x20 0x38 0xcc
-
-#CHECK: vflpdb %v31, %v31
-0xe7 0xff 0x00 0x20 0x3c 0xcc
-
-#CHECK: vlgv %r0, %v0, 0, 11
+# CHECK: vlgv %r0, %v0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x21
-#CHECK: vlgv %r2, %v19, 1383(%r4), 11
+# CHECK: vlgv %r2, %v19, 1383(%r4), 11
0xe7 0x23 0x45 0x67 0xb4 0x21
-#CHECK: vlgv %r15, %v31, 4095(%r15), 11
+# CHECK: vlgv %r15, %v31, 4095(%r15), 11
0xe7 0xff 0xff 0xff 0xb4 0x21
-#CHECK: vlgvb %r0, %v0, 0
+# CHECK: vlgvb %r0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x21
-#CHECK: vlgvb %r2, %v19, 1383(%r4)
+# CHECK: vlgvb %r2, %v19, 1383(%r4)
0xe7 0x23 0x45 0x67 0x04 0x21
-#CHECK: vlgvb %r15, %v31, 4095(%r15)
+# CHECK: vlgvb %r15, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x04 0x21
-#CHECK: vlgvf %r0, %v0, 0
+# CHECK: vlgvf %r0, %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x21
-#CHECK: vlgvf %r2, %v19, 1383(%r4)
+# CHECK: vlgvf %r2, %v19, 1383(%r4)
0xe7 0x23 0x45 0x67 0x24 0x21
-#CHECK: vlgvf %r15, %v31, 4095(%r15)
+# CHECK: vlgvf %r15, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x24 0x21
-#CHECK: vlgvg %r0, %v0, 0
+# CHECK: vlgvg %r0, %v0, 0
0xe7 0x00 0x00 0x00 0x30 0x21
-#CHECK: vlgvg %r2, %v19, 1383(%r4)
+# CHECK: vlgvg %r2, %v19, 1383(%r4)
0xe7 0x23 0x45 0x67 0x34 0x21
-#CHECK: vlgvg %r15, %v31, 4095(%r15)
+# CHECK: vlgvg %r15, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x34 0x21
-#CHECK: vlgvh %r0, %v0, 0
+# CHECK: vlgvh %r0, %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x21
-#CHECK: vlgvh %r2, %v19, 1383(%r4)
+# CHECK: vlgvh %r2, %v19, 1383(%r4)
0xe7 0x23 0x45 0x67 0x14 0x21
-#CHECK: vlgvh %r15, %v31, 4095(%r15)
+# CHECK: vlgvh %r15, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x14 0x21
-#CHECK: vfsq %v0, %v0, 11, 9
-0xe7 0x00 0x00 0x09 0xb0 0xce
-
-#CHECK: vfsq %v19, %v14, 11, 9
-0xe7 0x3e 0x00 0x09 0xb8 0xce
-
-#CHECK: vfsq %v31, %v31, 11, 9
-0xe7 0xff 0x00 0x09 0xbc 0xce
-
-#CHECK: vfsqdb %v0, %v0
-0xe7 0x00 0x00 0x00 0x30 0xce
-
-#CHECK: vfsqdb %v19, %v14
-0xe7 0x3e 0x00 0x00 0x38 0xce
-
-#CHECK: vfsqdb %v31, %v31
-0xe7 0xff 0x00 0x00 0x3c 0xce
-
-#CHECK: vftci %v0, %v0, 0, 11, 9
-0xe7 0x00 0x00 0x09 0xb0 0x4a
-
-#CHECK: vftci %v19, %v4, 1383, 11, 9
-0xe7 0x34 0x56 0x79 0xb8 0x4a
-
-#CHECK: vftci %v31, %v31, 4095, 11, 9
-0xe7 0xff 0xff 0xf9 0xbc 0x4a
-
-#CHECK: vftcidb %v0, %v0, 0
-0xe7 0x00 0x00 0x00 0x30 0x4a
-
-#CHECK: vftcidb %v19, %v4, 1383
-0xe7 0x34 0x56 0x70 0x38 0x4a
-
-#CHECK: vftcidb %v31, %v31, 4095
-0xe7 0xff 0xff 0xf0 0x3c 0x4a
-
-#CHECK: vll %v0, %r0, 0
+# CHECK: vll %v0, %r0, 0
0xe7 0x00 0x00 0x00 0x00 0x37
-#CHECK: vll %v18, %r3, 1383(%r4)
+# CHECK: vll %v18, %r3, 1383(%r4)
0xe7 0x23 0x45 0x67 0x08 0x37
-#CHECK: vll %v31, %r15, 4095(%r15)
+# CHECK: vll %v31, %r15, 4095(%r15)
0xe7 0xff 0xff 0xff 0x08 0x37
-#CHECK: vllez %v0, 0, 11
+# CHECK: vllez %v0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x04
-#CHECK: vllez %v17, 2475(%r7,%r8), 11
+# CHECK: vllez %v17, 2475(%r7,%r8), 11
0xe7 0x17 0x89 0xab 0xb8 0x04
-#CHECK: vllez %v31, 4095(%r15,%r15), 11
+# CHECK: vllez %v31, 4095(%r15,%r15), 11
0xe7 0xff 0xff 0xff 0xb8 0x04
-#CHECK: vllezb %v0, 0
+# CHECK: vllezb %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x04
-#CHECK: vllezb %v17, 2475(%r7,%r8)
+# CHECK: vllezb %v17, 2475(%r7,%r8)
0xe7 0x17 0x89 0xab 0x08 0x04
-#CHECK: vllezb %v31, 4095(%r15,%r15)
+# CHECK: vllezb %v31, 4095(%r15,%r15)
0xe7 0xff 0xff 0xff 0x08 0x04
-#CHECK: vllezf %v0, 0
+# CHECK: vllezf %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x04
-#CHECK: vllezf %v17, 2475(%r7,%r8)
+# CHECK: vllezf %v17, 2475(%r7,%r8)
0xe7 0x17 0x89 0xab 0x28 0x04
-#CHECK: vllezf %v31, 4095(%r15,%r15)
+# CHECK: vllezf %v31, 4095(%r15,%r15)
0xe7 0xff 0xff 0xff 0x28 0x04
-#CHECK: vllezg %v0, 0
+# CHECK: vllezg %v0, 0
0xe7 0x00 0x00 0x00 0x30 0x04
-#CHECK: vllezg %v17, 2475(%r7,%r8)
+# CHECK: vllezg %v17, 2475(%r7,%r8)
0xe7 0x17 0x89 0xab 0x38 0x04
-#CHECK: vllezg %v31, 4095(%r15,%r15)
+# CHECK: vllezg %v31, 4095(%r15,%r15)
0xe7 0xff 0xff 0xff 0x38 0x04
-#CHECK: vllezh %v0, 0
+# CHECK: vllezh %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x04
-#CHECK: vllezh %v17, 2475(%r7,%r8)
+# CHECK: vllezh %v17, 2475(%r7,%r8)
0xe7 0x17 0x89 0xab 0x18 0x04
-#CHECK: vllezh %v31, 4095(%r15,%r15)
+# CHECK: vllezh %v31, 4095(%r15,%r15)
0xe7 0xff 0xff 0xff 0x18 0x04
-#CHECK: vlm %v0, %v0, 0
+# CHECK: vlm %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x36
-#CHECK: vlm %v12, %v18, 1110(%r3)
+# CHECK: vlm %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x04 0x36
-#CHECK: vlm %v31, %v31, 4095(%r15)
+# CHECK: vlm %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x0c 0x36
-#CHECK: vlp %v0, %v0, 11
+# CHECK: vlp %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xdf
-#CHECK: vlp %v19, %v14, 11
+# CHECK: vlp %v19, %v14, 11
0xe7 0x3e 0x00 0x00 0xb8 0xdf
-#CHECK: vlp %v31, %v31, 11
+# CHECK: vlp %v31, %v31, 11
0xe7 0xff 0x00 0x00 0xbc 0xdf
-#CHECK: vlpb %v0, %v0
+# CHECK: vlpb %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xdf
-#CHECK: vlpb %v19, %v14
+# CHECK: vlpb %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0xdf
-#CHECK: vlpb %v31, %v31
+# CHECK: vlpb %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0xdf
-#CHECK: vlpf %v0, %v0
+# CHECK: vlpf %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xdf
-#CHECK: vlpf %v19, %v14
+# CHECK: vlpf %v19, %v14
0xe7 0x3e 0x00 0x00 0x28 0xdf
-#CHECK: vlpf %v31, %v31
+# CHECK: vlpf %v31, %v31
0xe7 0xff 0x00 0x00 0x2c 0xdf
-#CHECK: vlpg %v0, %v0
+# CHECK: vlpg %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xdf
-#CHECK: vlpg %v19, %v14
+# CHECK: vlpg %v19, %v14
0xe7 0x3e 0x00 0x00 0x38 0xdf
-#CHECK: vlpg %v31, %v31
+# CHECK: vlpg %v31, %v31
0xe7 0xff 0x00 0x00 0x3c 0xdf
-#CHECK: vlph %v0, %v0
+# CHECK: vlph %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xdf
-#CHECK: vlph %v19, %v14
+# CHECK: vlph %v19, %v14
0xe7 0x3e 0x00 0x00 0x18 0xdf
-#CHECK: vlph %v31, %v31
+# CHECK: vlph %v31, %v31
0xe7 0xff 0x00 0x00 0x1c 0xdf
-#CHECK: vlr %v0, %v0
+# CHECK: vlr %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x56
-#CHECK: vlr %v19, %v14
+# CHECK: vlr %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0x56
-#CHECK: vlr %v31, %v31
+# CHECK: vlr %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0x56
-#CHECK: vlrep %v0, 0, 11
+# CHECK: vlrep %v0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x05
-#CHECK: vlrep %v17, 2475(%r7,%r8), 11
+# CHECK: vlrep %v17, 2475(%r7,%r8), 11
0xe7 0x17 0x89 0xab 0xb8 0x05
-#CHECK: vlrep %v31, 4095(%r15,%r15), 11
+# CHECK: vlrep %v31, 4095(%r15,%r15), 11
0xe7 0xff 0xff 0xff 0xb8 0x05
-#CHECK: vlrepb %v0, 0
+# CHECK: vlrepb %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x05
-#CHECK: vlrepb %v17, 2475(%r7,%r8)
+# CHECK: vlrepb %v17, 2475(%r7,%r8)
0xe7 0x17 0x89 0xab 0x08 0x05
-#CHECK: vlrepb %v31, 4095(%r15,%r15)
+# CHECK: vlrepb %v31, 4095(%r15,%r15)
0xe7 0xff 0xff 0xff 0x08 0x05
-#CHECK: vlrepf %v0, 0
+# CHECK: vlrepf %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x05
-#CHECK: vlrepf %v17, 2475(%r7,%r8)
+# CHECK: vlrepf %v17, 2475(%r7,%r8)
0xe7 0x17 0x89 0xab 0x28 0x05
-#CHECK: vlrepf %v31, 4095(%r15,%r15)
+# CHECK: vlrepf %v31, 4095(%r15,%r15)
0xe7 0xff 0xff 0xff 0x28 0x05
-#CHECK: vlrepg %v0, 0
+# CHECK: vlrepg %v0, 0
0xe7 0x00 0x00 0x00 0x30 0x05
-#CHECK: vlrepg %v17, 2475(%r7,%r8)
+# CHECK: vlrepg %v17, 2475(%r7,%r8)
0xe7 0x17 0x89 0xab 0x38 0x05
-#CHECK: vlrepg %v31, 4095(%r15,%r15)
+# CHECK: vlrepg %v31, 4095(%r15,%r15)
0xe7 0xff 0xff 0xff 0x38 0x05
-#CHECK: vlreph %v0, 0
+# CHECK: vlreph %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x05
-#CHECK: vlreph %v17, 2475(%r7,%r8)
+# CHECK: vlreph %v17, 2475(%r7,%r8)
0xe7 0x17 0x89 0xab 0x18 0x05
-#CHECK: vlreph %v31, 4095(%r15,%r15)
+# CHECK: vlreph %v31, 4095(%r15,%r15)
0xe7 0xff 0xff 0xff 0x18 0x05
-#CHECK: vlvg %v0, %r0, 0, 11
+# CHECK: vlvg %v0, %r0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x22
-#CHECK: vlvg %v18, %r3, 1383(%r4), 11
+# CHECK: vlvg %v18, %r3, 1383(%r4), 11
0xe7 0x23 0x45 0x67 0xb8 0x22
-#CHECK: vlvg %v31, %r15, 4095(%r15), 11
+# CHECK: vlvg %v31, %r15, 4095(%r15), 11
0xe7 0xff 0xff 0xff 0xb8 0x22
-#CHECK: vlvgb %v0, %r0, 0
+# CHECK: vlvgb %v0, %r0, 0
0xe7 0x00 0x00 0x00 0x00 0x22
-#CHECK: vlvgb %v18, %r3, 1383(%r4)
+# CHECK: vlvgb %v18, %r3, 1383(%r4)
0xe7 0x23 0x45 0x67 0x08 0x22
-#CHECK: vlvgb %v31, %r15, 4095(%r15)
+# CHECK: vlvgb %v31, %r15, 4095(%r15)
0xe7 0xff 0xff 0xff 0x08 0x22
-#CHECK: vlvgf %v0, %r0, 0
+# CHECK: vlvgf %v0, %r0, 0
0xe7 0x00 0x00 0x00 0x20 0x22
-#CHECK: vlvgf %v18, %r3, 1383(%r4)
+# CHECK: vlvgf %v18, %r3, 1383(%r4)
0xe7 0x23 0x45 0x67 0x28 0x22
-#CHECK: vlvgf %v31, %r15, 4095(%r15)
+# CHECK: vlvgf %v31, %r15, 4095(%r15)
0xe7 0xff 0xff 0xff 0x28 0x22
-#CHECK: vlvgg %v0, %r0, 0
+# CHECK: vlvgg %v0, %r0, 0
0xe7 0x00 0x00 0x00 0x30 0x22
-#CHECK: vlvgg %v18, %r3, 1383(%r4)
+# CHECK: vlvgg %v18, %r3, 1383(%r4)
0xe7 0x23 0x45 0x67 0x38 0x22
-#CHECK: vlvgg %v31, %r15, 4095(%r15)
+# CHECK: vlvgg %v31, %r15, 4095(%r15)
0xe7 0xff 0xff 0xff 0x38 0x22
-#CHECK: vlvgh %v0, %r0, 0
+# CHECK: vlvgh %v0, %r0, 0
0xe7 0x00 0x00 0x00 0x10 0x22
-#CHECK: vlvgh %v18, %r3, 1383(%r4)
+# CHECK: vlvgh %v18, %r3, 1383(%r4)
0xe7 0x23 0x45 0x67 0x18 0x22
-#CHECK: vlvgh %v31, %r15, 4095(%r15)
+# CHECK: vlvgh %v31, %r15, 4095(%r15)
0xe7 0xff 0xff 0xff 0x18 0x22
-#CHECK: vlvgp %v0, %r0, %r0
+# CHECK: vlvgp %v0, %r0, %r0
0xe7 0x00 0x00 0x00 0x00 0x62
-#CHECK: vlvgp %v18, %r3, %r4
+# CHECK: vlvgp %v18, %r3, %r4
0xe7 0x23 0x40 0x00 0x08 0x62
-#CHECK: vlvgp %v31, %r15, %r15
+# CHECK: vlvgp %v31, %r15, %r15
0xe7 0xff 0xf0 0x00 0x08 0x62
-#CHECK: vmae %v0, %v0, %v0, %v0, 11
+# CHECK: vmae %v0, %v0, %v0, %v0, 11
0xe7 0x00 0x0b 0x00 0x00 0xae
-#CHECK: vmae %v3, %v20, %v5, %v22, 11
+# CHECK: vmae %v3, %v20, %v5, %v22, 11
0xe7 0x34 0x5b 0x00 0x65 0xae
-#CHECK: vmae %v31, %v31, %v31, %v31, 11
+# CHECK: vmae %v31, %v31, %v31, %v31, 11
0xe7 0xff 0xfb 0x00 0xff 0xae
-#CHECK: vmaeb %v0, %v0, %v0, %v0
+# CHECK: vmaeb %v0, %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xae
-#CHECK: vmaeb %v3, %v20, %v5, %v22
+# CHECK: vmaeb %v3, %v20, %v5, %v22
0xe7 0x34 0x50 0x00 0x65 0xae
-#CHECK: vmaeb %v31, %v31, %v31, %v31
+# CHECK: vmaeb %v31, %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0xff 0xae
-#CHECK: vmaef %v0, %v0, %v0, %v0
+# CHECK: vmaef %v0, %v0, %v0, %v0
0xe7 0x00 0x02 0x00 0x00 0xae
-#CHECK: vmaef %v3, %v20, %v5, %v22
+# CHECK: vmaef %v3, %v20, %v5, %v22
0xe7 0x34 0x52 0x00 0x65 0xae
-#CHECK: vmaef %v31, %v31, %v31, %v31
+# CHECK: vmaef %v31, %v31, %v31, %v31
0xe7 0xff 0xf2 0x00 0xff 0xae
-#CHECK: vmaeh %v0, %v0, %v0, %v0
+# CHECK: vmaeh %v0, %v0, %v0, %v0
0xe7 0x00 0x01 0x00 0x00 0xae
-#CHECK: vmaeh %v3, %v20, %v5, %v22
+# CHECK: vmaeh %v3, %v20, %v5, %v22
0xe7 0x34 0x51 0x00 0x65 0xae
-#CHECK: vmaeh %v31, %v31, %v31, %v31
+# CHECK: vmaeh %v31, %v31, %v31, %v31
0xe7 0xff 0xf1 0x00 0xff 0xae
-#CHECK: vmah %v0, %v0, %v0, %v0, 11
+# CHECK: vmah %v0, %v0, %v0, %v0, 11
0xe7 0x00 0x0b 0x00 0x00 0xab
-#CHECK: vmah %v3, %v20, %v5, %v22, 11
+# CHECK: vmah %v3, %v20, %v5, %v22, 11
0xe7 0x34 0x5b 0x00 0x65 0xab
-#CHECK: vmah %v31, %v31, %v31, %v31, 11
+# CHECK: vmah %v31, %v31, %v31, %v31, 11
0xe7 0xff 0xfb 0x00 0xff 0xab
-#CHECK: vmahb %v0, %v0, %v0, %v0
+# CHECK: vmahb %v0, %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xab
-#CHECK: vmahb %v3, %v20, %v5, %v22
+# CHECK: vmahb %v3, %v20, %v5, %v22
0xe7 0x34 0x50 0x00 0x65 0xab
-#CHECK: vmahb %v31, %v31, %v31, %v31
+# CHECK: vmahb %v31, %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0xff 0xab
-#CHECK: vmahf %v0, %v0, %v0, %v0
+# CHECK: vmahf %v0, %v0, %v0, %v0
0xe7 0x00 0x02 0x00 0x00 0xab
-#CHECK: vmahf %v3, %v20, %v5, %v22
+# CHECK: vmahf %v3, %v20, %v5, %v22
0xe7 0x34 0x52 0x00 0x65 0xab
-#CHECK: vmahf %v31, %v31, %v31, %v31
+# CHECK: vmahf %v31, %v31, %v31, %v31
0xe7 0xff 0xf2 0x00 0xff 0xab
-#CHECK: vmahh %v0, %v0, %v0, %v0
+# CHECK: vmahh %v0, %v0, %v0, %v0
0xe7 0x00 0x01 0x00 0x00 0xab
-#CHECK: vmahh %v3, %v20, %v5, %v22
+# CHECK: vmahh %v3, %v20, %v5, %v22
0xe7 0x34 0x51 0x00 0x65 0xab
-#CHECK: vmahh %v31, %v31, %v31, %v31
+# CHECK: vmahh %v31, %v31, %v31, %v31
0xe7 0xff 0xf1 0x00 0xff 0xab
-#CHECK: vmal %v0, %v0, %v0, %v0, 11
+# CHECK: vmal %v0, %v0, %v0, %v0, 11
0xe7 0x00 0x0b 0x00 0x00 0xaa
-#CHECK: vmal %v3, %v20, %v5, %v22, 11
+# CHECK: vmal %v3, %v20, %v5, %v22, 11
0xe7 0x34 0x5b 0x00 0x65 0xaa
-#CHECK: vmal %v31, %v31, %v31, %v31, 11
+# CHECK: vmal %v31, %v31, %v31, %v31, 11
0xe7 0xff 0xfb 0x00 0xff 0xaa
-#CHECK: vmalb %v0, %v0, %v0, %v0
+# CHECK: vmalb %v0, %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xaa
-#CHECK: vmalb %v3, %v20, %v5, %v22
+# CHECK: vmalb %v3, %v20, %v5, %v22
0xe7 0x34 0x50 0x00 0x65 0xaa
-#CHECK: vmalb %v31, %v31, %v31, %v31
+# CHECK: vmalb %v31, %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0xff 0xaa
-#CHECK: vmale %v0, %v0, %v0, %v0, 11
+# CHECK: vmale %v0, %v0, %v0, %v0, 11
0xe7 0x00 0x0b 0x00 0x00 0xac
-#CHECK: vmale %v3, %v20, %v5, %v22, 11
+# CHECK: vmale %v3, %v20, %v5, %v22, 11
0xe7 0x34 0x5b 0x00 0x65 0xac
-#CHECK: vmale %v31, %v31, %v31, %v31, 11
+# CHECK: vmale %v31, %v31, %v31, %v31, 11
0xe7 0xff 0xfb 0x00 0xff 0xac
-#CHECK: vmaleb %v0, %v0, %v0, %v0
+# CHECK: vmaleb %v0, %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xac
-#CHECK: vmaleb %v3, %v20, %v5, %v22
+# CHECK: vmaleb %v3, %v20, %v5, %v22
0xe7 0x34 0x50 0x00 0x65 0xac
-#CHECK: vmaleb %v31, %v31, %v31, %v31
+# CHECK: vmaleb %v31, %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0xff 0xac
-#CHECK: vmalef %v0, %v0, %v0, %v0
+# CHECK: vmalef %v0, %v0, %v0, %v0
0xe7 0x00 0x02 0x00 0x00 0xac
-#CHECK: vmalef %v3, %v20, %v5, %v22
+# CHECK: vmalef %v3, %v20, %v5, %v22
0xe7 0x34 0x52 0x00 0x65 0xac
-#CHECK: vmalef %v31, %v31, %v31, %v31
+# CHECK: vmalef %v31, %v31, %v31, %v31
0xe7 0xff 0xf2 0x00 0xff 0xac
-#CHECK: vmaleh %v0, %v0, %v0, %v0
+# CHECK: vmaleh %v0, %v0, %v0, %v0
0xe7 0x00 0x01 0x00 0x00 0xac
-#CHECK: vmaleh %v3, %v20, %v5, %v22
+# CHECK: vmaleh %v3, %v20, %v5, %v22
0xe7 0x34 0x51 0x00 0x65 0xac
-#CHECK: vmaleh %v31, %v31, %v31, %v31
+# CHECK: vmaleh %v31, %v31, %v31, %v31
0xe7 0xff 0xf1 0x00 0xff 0xac
-#CHECK: vmalf %v0, %v0, %v0, %v0
+# CHECK: vmalf %v0, %v0, %v0, %v0
0xe7 0x00 0x02 0x00 0x00 0xaa
-#CHECK: vmalf %v3, %v20, %v5, %v22
+# CHECK: vmalf %v3, %v20, %v5, %v22
0xe7 0x34 0x52 0x00 0x65 0xaa
-#CHECK: vmalf %v31, %v31, %v31, %v31
+# CHECK: vmalf %v31, %v31, %v31, %v31
0xe7 0xff 0xf2 0x00 0xff 0xaa
-#CHECK: vmalh %v0, %v0, %v0, %v0, 11
+# CHECK: vmalh %v0, %v0, %v0, %v0, 11
0xe7 0x00 0x0b 0x00 0x00 0xa9
-#CHECK: vmalh %v3, %v20, %v5, %v22, 11
+# CHECK: vmalh %v3, %v20, %v5, %v22, 11
0xe7 0x34 0x5b 0x00 0x65 0xa9
-#CHECK: vmalh %v31, %v31, %v31, %v31, 11
+# CHECK: vmalh %v31, %v31, %v31, %v31, 11
0xe7 0xff 0xfb 0x00 0xff 0xa9
-#CHECK: vmalhb %v0, %v0, %v0, %v0
+# CHECK: vmalhb %v0, %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xa9
-#CHECK: vmalhb %v3, %v20, %v5, %v22
+# CHECK: vmalhb %v3, %v20, %v5, %v22
0xe7 0x34 0x50 0x00 0x65 0xa9
-#CHECK: vmalhb %v31, %v31, %v31, %v31
+# CHECK: vmalhb %v31, %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0xff 0xa9
-#CHECK: vmalhf %v0, %v0, %v0, %v0
+# CHECK: vmalhf %v0, %v0, %v0, %v0
0xe7 0x00 0x02 0x00 0x00 0xa9
-#CHECK: vmalhf %v3, %v20, %v5, %v22
+# CHECK: vmalhf %v3, %v20, %v5, %v22
0xe7 0x34 0x52 0x00 0x65 0xa9
-#CHECK: vmalhf %v31, %v31, %v31, %v31
+# CHECK: vmalhf %v31, %v31, %v31, %v31
0xe7 0xff 0xf2 0x00 0xff 0xa9
-#CHECK: vmalhh %v0, %v0, %v0, %v0
+# CHECK: vmalhh %v0, %v0, %v0, %v0
0xe7 0x00 0x01 0x00 0x00 0xa9
-#CHECK: vmalhh %v3, %v20, %v5, %v22
+# CHECK: vmalhh %v3, %v20, %v5, %v22
0xe7 0x34 0x51 0x00 0x65 0xa9
-#CHECK: vmalhh %v31, %v31, %v31, %v31
+# CHECK: vmalhh %v31, %v31, %v31, %v31
0xe7 0xff 0xf1 0x00 0xff 0xa9
-#CHECK: vmalhw %v0, %v0, %v0, %v0
+# CHECK: vmalhw %v0, %v0, %v0, %v0
0xe7 0x00 0x01 0x00 0x00 0xaa
-#CHECK: vmalhw %v3, %v20, %v5, %v22
+# CHECK: vmalhw %v3, %v20, %v5, %v22
0xe7 0x34 0x51 0x00 0x65 0xaa
-#CHECK: vmalhw %v31, %v31, %v31, %v31
+# CHECK: vmalhw %v31, %v31, %v31, %v31
0xe7 0xff 0xf1 0x00 0xff 0xaa
-#CHECK: vmalo %v0, %v0, %v0, %v0, 11
+# CHECK: vmalo %v0, %v0, %v0, %v0, 11
0xe7 0x00 0x0b 0x00 0x00 0xad
-#CHECK: vmalo %v3, %v20, %v5, %v22, 11
+# CHECK: vmalo %v3, %v20, %v5, %v22, 11
0xe7 0x34 0x5b 0x00 0x65 0xad
-#CHECK: vmalo %v31, %v31, %v31, %v31, 11
+# CHECK: vmalo %v31, %v31, %v31, %v31, 11
0xe7 0xff 0xfb 0x00 0xff 0xad
-#CHECK: vmalob %v0, %v0, %v0, %v0
+# CHECK: vmalob %v0, %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xad
-#CHECK: vmalob %v3, %v20, %v5, %v22
+# CHECK: vmalob %v3, %v20, %v5, %v22
0xe7 0x34 0x50 0x00 0x65 0xad
-#CHECK: vmalob %v31, %v31, %v31, %v31
+# CHECK: vmalob %v31, %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0xff 0xad
-#CHECK: vmalof %v0, %v0, %v0, %v0
+# CHECK: vmalof %v0, %v0, %v0, %v0
0xe7 0x00 0x02 0x00 0x00 0xad
-#CHECK: vmalof %v3, %v20, %v5, %v22
+# CHECK: vmalof %v3, %v20, %v5, %v22
0xe7 0x34 0x52 0x00 0x65 0xad
-#CHECK: vmalof %v31, %v31, %v31, %v31
+# CHECK: vmalof %v31, %v31, %v31, %v31
0xe7 0xff 0xf2 0x00 0xff 0xad
-#CHECK: vmaloh %v0, %v0, %v0, %v0
+# CHECK: vmaloh %v0, %v0, %v0, %v0
0xe7 0x00 0x01 0x00 0x00 0xad
-#CHECK: vmaloh %v3, %v20, %v5, %v22
+# CHECK: vmaloh %v3, %v20, %v5, %v22
0xe7 0x34 0x51 0x00 0x65 0xad
-#CHECK: vmaloh %v31, %v31, %v31, %v31
+# CHECK: vmaloh %v31, %v31, %v31, %v31
0xe7 0xff 0xf1 0x00 0xff 0xad
-#CHECK: vmao %v0, %v0, %v0, %v0, 11
+# CHECK: vmao %v0, %v0, %v0, %v0, 11
0xe7 0x00 0x0b 0x00 0x00 0xaf
-#CHECK: vmao %v3, %v20, %v5, %v22, 11
+# CHECK: vmao %v3, %v20, %v5, %v22, 11
0xe7 0x34 0x5b 0x00 0x65 0xaf
-#CHECK: vmao %v31, %v31, %v31, %v31, 11
+# CHECK: vmao %v31, %v31, %v31, %v31, 11
0xe7 0xff 0xfb 0x00 0xff 0xaf
-#CHECK: vmaob %v0, %v0, %v0, %v0
+# CHECK: vmaob %v0, %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xaf
-#CHECK: vmaob %v3, %v20, %v5, %v22
+# CHECK: vmaob %v3, %v20, %v5, %v22
0xe7 0x34 0x50 0x00 0x65 0xaf
-#CHECK: vmaob %v31, %v31, %v31, %v31
+# CHECK: vmaob %v31, %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0xff 0xaf
-#CHECK: vmaof %v0, %v0, %v0, %v0
+# CHECK: vmaof %v0, %v0, %v0, %v0
0xe7 0x00 0x02 0x00 0x00 0xaf
-#CHECK: vmaof %v3, %v20, %v5, %v22
+# CHECK: vmaof %v3, %v20, %v5, %v22
0xe7 0x34 0x52 0x00 0x65 0xaf
-#CHECK: vmaof %v31, %v31, %v31, %v31
+# CHECK: vmaof %v31, %v31, %v31, %v31
0xe7 0xff 0xf2 0x00 0xff 0xaf
-#CHECK: vmaoh %v0, %v0, %v0, %v0
+# CHECK: vmaoh %v0, %v0, %v0, %v0
0xe7 0x00 0x01 0x00 0x00 0xaf
-#CHECK: vmaoh %v3, %v20, %v5, %v22
+# CHECK: vmaoh %v3, %v20, %v5, %v22
0xe7 0x34 0x51 0x00 0x65 0xaf
-#CHECK: vmaoh %v31, %v31, %v31, %v31
+# CHECK: vmaoh %v31, %v31, %v31, %v31
0xe7 0xff 0xf1 0x00 0xff 0xaf
-#CHECK: vme %v0, %v0, %v0, 11
+# CHECK: vme %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xa6
-#CHECK: vme %v18, %v3, %v20, 11
+# CHECK: vme %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xa6
-#CHECK: vme %v31, %v31, %v31, 11
+# CHECK: vme %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xa6
-#CHECK: vmeb %v0, %v0, %v0
+# CHECK: vmeb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xa6
-#CHECK: vmeb %v18, %v3, %v20
+# CHECK: vmeb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xa6
-#CHECK: vmeb %v31, %v31, %v31
+# CHECK: vmeb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xa6
-#CHECK: vmef %v0, %v0, %v0
+# CHECK: vmef %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xa6
-#CHECK: vmef %v18, %v3, %v20
+# CHECK: vmef %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xa6
-#CHECK: vmef %v31, %v31, %v31
+# CHECK: vmef %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xa6
-#CHECK: vmeh %v0, %v0, %v0
+# CHECK: vmeh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xa6
-#CHECK: vmeh %v18, %v3, %v20
+# CHECK: vmeh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xa6
-#CHECK: vmeh %v31, %v31, %v31
+# CHECK: vmeh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xa6
-#CHECK: vmh %v0, %v0, %v0, 11
+# CHECK: vmh %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xa3
-#CHECK: vmh %v18, %v3, %v20, 11
+# CHECK: vmh %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xa3
-#CHECK: vmh %v31, %v31, %v31, 11
+# CHECK: vmh %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xa3
-#CHECK: vmhb %v0, %v0, %v0
+# CHECK: vmhb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xa3
-#CHECK: vmhb %v18, %v3, %v20
+# CHECK: vmhb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xa3
-#CHECK: vmhb %v31, %v31, %v31
+# CHECK: vmhb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xa3
-#CHECK: vmhf %v0, %v0, %v0
+# CHECK: vmhf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xa3
-#CHECK: vmhf %v18, %v3, %v20
+# CHECK: vmhf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xa3
-#CHECK: vmhf %v31, %v31, %v31
+# CHECK: vmhf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xa3
-#CHECK: vmhh %v0, %v0, %v0
+# CHECK: vmhh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xa3
-#CHECK: vmhh %v18, %v3, %v20
+# CHECK: vmhh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xa3
-#CHECK: vmhh %v31, %v31, %v31
+# CHECK: vmhh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xa3
-#CHECK: vml %v0, %v0, %v0, 11
+# CHECK: vml %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xa2
-#CHECK: vml %v18, %v3, %v20, 11
+# CHECK: vml %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xa2
-#CHECK: vml %v31, %v31, %v31, 11
+# CHECK: vml %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xa2
-#CHECK: vmlb %v0, %v0, %v0
+# CHECK: vmlb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xa2
-#CHECK: vmlb %v18, %v3, %v20
+# CHECK: vmlb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xa2
-#CHECK: vmlb %v31, %v31, %v31
+# CHECK: vmlb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xa2
-#CHECK: vmlf %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x20 0xa2
-
-#CHECK: vmlf %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x2a 0xa2
-
-#CHECK: vmlf %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x2e 0xa2
-
-#CHECK: vmle %v0, %v0, %v0, 11
+# CHECK: vmle %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xa4
-#CHECK: vmle %v18, %v3, %v20, 11
+# CHECK: vmle %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xa4
-#CHECK: vmle %v31, %v31, %v31, 11
+# CHECK: vmle %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xa4
-#CHECK: vmleb %v0, %v0, %v0
+# CHECK: vmleb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xa4
-#CHECK: vmleb %v18, %v3, %v20
+# CHECK: vmleb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xa4
-#CHECK: vmleb %v31, %v31, %v31
+# CHECK: vmleb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xa4
-#CHECK: vmlef %v0, %v0, %v0
+# CHECK: vmlef %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xa4
-#CHECK: vmlef %v18, %v3, %v20
+# CHECK: vmlef %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xa4
-#CHECK: vmlef %v31, %v31, %v31
+# CHECK: vmlef %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xa4
-#CHECK: vmleh %v0, %v0, %v0
+# CHECK: vmleh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xa4
-#CHECK: vmleh %v18, %v3, %v20
+# CHECK: vmleh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xa4
-#CHECK: vmleh %v31, %v31, %v31
+# CHECK: vmleh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xa4
-#CHECK: vmlh %v0, %v0, %v0, 11
+# CHECK: vmlf %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x20 0xa2
+
+# CHECK: vmlf %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x2a 0xa2
+
+# CHECK: vmlf %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x2e 0xa2
+
+# CHECK: vmlh %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xa1
-#CHECK: vmlh %v18, %v3, %v20, 11
+# CHECK: vmlh %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xa1
-#CHECK: vmlh %v31, %v31, %v31, 11
+# CHECK: vmlh %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xa1
-#CHECK: vmlhb %v0, %v0, %v0
+# CHECK: vmlhb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xa1
-#CHECK: vmlhb %v18, %v3, %v20
+# CHECK: vmlhb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xa1
-#CHECK: vmlhb %v31, %v31, %v31
+# CHECK: vmlhb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xa1
-#CHECK: vmlhf %v0, %v0, %v0
+# CHECK: vmlhf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xa1
-#CHECK: vmlhf %v18, %v3, %v20
+# CHECK: vmlhf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xa1
-#CHECK: vmlhf %v31, %v31, %v31
+# CHECK: vmlhf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xa1
-#CHECK: vmlhh %v0, %v0, %v0
+# CHECK: vmlhh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xa1
-#CHECK: vmlhh %v18, %v3, %v20
+# CHECK: vmlhh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xa1
-#CHECK: vmlhh %v31, %v31, %v31
+# CHECK: vmlhh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xa1
-#CHECK: vmlhw %v0, %v0, %v0
+# CHECK: vmlhw %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xa2
-#CHECK: vmlhw %v18, %v3, %v20
+# CHECK: vmlhw %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xa2
-#CHECK: vmlhw %v31, %v31, %v31
+# CHECK: vmlhw %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xa2
-#CHECK: vmlo %v0, %v0, %v0, 11
+# CHECK: vmlo %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xa5
-#CHECK: vmlo %v18, %v3, %v20, 11
+# CHECK: vmlo %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xa5
-#CHECK: vmlo %v31, %v31, %v31, 11
+# CHECK: vmlo %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xa5
-#CHECK: vmlob %v0, %v0, %v0
+# CHECK: vmlob %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xa5
-#CHECK: vmlob %v18, %v3, %v20
+# CHECK: vmlob %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xa5
-#CHECK: vmlob %v31, %v31, %v31
+# CHECK: vmlob %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xa5
-#CHECK: vmlof %v0, %v0, %v0
+# CHECK: vmlof %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xa5
-#CHECK: vmlof %v18, %v3, %v20
+# CHECK: vmlof %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xa5
-#CHECK: vmlof %v31, %v31, %v31
+# CHECK: vmlof %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xa5
-#CHECK: vmloh %v0, %v0, %v0
+# CHECK: vmloh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xa5
-#CHECK: vmloh %v18, %v3, %v20
+# CHECK: vmloh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xa5
-#CHECK: vmloh %v31, %v31, %v31
+# CHECK: vmloh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xa5
-#CHECK: vmn %v0, %v0, %v0, 11
+# CHECK: vmn %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xfe
-#CHECK: vmn %v18, %v3, %v20, 11
+# CHECK: vmn %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xfe
-#CHECK: vmn %v31, %v31, %v31, 11
+# CHECK: vmn %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xfe
-#CHECK: vmnb %v0, %v0, %v0
+# CHECK: vmnb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xfe
-#CHECK: vmnb %v18, %v3, %v20
+# CHECK: vmnb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xfe
-#CHECK: vmnb %v31, %v31, %v31
+# CHECK: vmnb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xfe
-#CHECK: vmnf %v0, %v0, %v0
+# CHECK: vmnf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xfe
-#CHECK: vmnf %v18, %v3, %v20
+# CHECK: vmnf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xfe
-#CHECK: vmnf %v31, %v31, %v31
+# CHECK: vmnf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xfe
-#CHECK: vmng %v0, %v0, %v0
+# CHECK: vmng %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xfe
-#CHECK: vmng %v18, %v3, %v20
+# CHECK: vmng %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xfe
-#CHECK: vmng %v31, %v31, %v31
+# CHECK: vmng %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xfe
-#CHECK: vmnh %v0, %v0, %v0
+# CHECK: vmnh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xfe
-#CHECK: vmnh %v18, %v3, %v20
+# CHECK: vmnh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xfe
-#CHECK: vmnh %v31, %v31, %v31
+# CHECK: vmnh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xfe
-#CHECK: vmnl %v0, %v0, %v0, 11
+# CHECK: vmnl %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xfc
-#CHECK: vmnl %v18, %v3, %v20, 11
+# CHECK: vmnl %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xfc
-#CHECK: vmnl %v31, %v31, %v31, 11
+# CHECK: vmnl %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xfc
-#CHECK: vmnlb %v0, %v0, %v0
+# CHECK: vmnlb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xfc
-#CHECK: vmnlb %v18, %v3, %v20
+# CHECK: vmnlb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xfc
-#CHECK: vmnlb %v31, %v31, %v31
+# CHECK: vmnlb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xfc
-#CHECK: vmnlf %v0, %v0, %v0
+# CHECK: vmnlf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xfc
-#CHECK: vmnlf %v18, %v3, %v20
+# CHECK: vmnlf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xfc
-#CHECK: vmnlf %v31, %v31, %v31
+# CHECK: vmnlf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xfc
-#CHECK: vmnlg %v0, %v0, %v0
+# CHECK: vmnlg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xfc
-#CHECK: vmnlg %v18, %v3, %v20
+# CHECK: vmnlg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xfc
-#CHECK: vmnlg %v31, %v31, %v31
+# CHECK: vmnlg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xfc
-#CHECK: vmnlh %v0, %v0, %v0
+# CHECK: vmnlh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xfc
-#CHECK: vmnlh %v18, %v3, %v20
+# CHECK: vmnlh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xfc
-#CHECK: vmnlh %v31, %v31, %v31
+# CHECK: vmnlh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xfc
-#CHECK: vmo %v0, %v0, %v0, 11
+# CHECK: vmo %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xa7
-#CHECK: vmo %v18, %v3, %v20, 11
+# CHECK: vmo %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xa7
-#CHECK: vmo %v31, %v31, %v31, 11
+# CHECK: vmo %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xa7
-#CHECK: vmob %v0, %v0, %v0
+# CHECK: vmob %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xa7
-#CHECK: vmob %v18, %v3, %v20
+# CHECK: vmob %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xa7
-#CHECK: vmob %v31, %v31, %v31
+# CHECK: vmob %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xa7
-#CHECK: vmof %v0, %v0, %v0
+# CHECK: vmof %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xa7
-#CHECK: vmof %v18, %v3, %v20
+# CHECK: vmof %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xa7
-#CHECK: vmof %v31, %v31, %v31
+# CHECK: vmof %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xa7
-#CHECK: vmoh %v0, %v0, %v0
+# CHECK: vmoh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xa7
-#CHECK: vmoh %v18, %v3, %v20
+# CHECK: vmoh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xa7
-#CHECK: vmoh %v31, %v31, %v31
+# CHECK: vmoh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xa7
-#CHECK: vmrh %v0, %v0, %v0, 11
+# CHECK: vmrh %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0x61
-#CHECK: vmrh %v18, %v3, %v20, 11
+# CHECK: vmrh %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0x61
-#CHECK: vmrh %v31, %v31, %v31, 11
+# CHECK: vmrh %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0x61
-#CHECK: vmrhb %v0, %v0, %v0
+# CHECK: vmrhb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x61
-#CHECK: vmrhb %v18, %v3, %v20
+# CHECK: vmrhb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x61
-#CHECK: vmrhb %v31, %v31, %v31
+# CHECK: vmrhb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x61
-#CHECK: vmrhf %v0, %v0, %v0
+# CHECK: vmrhf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0x61
-#CHECK: vmrhf %v18, %v3, %v20
+# CHECK: vmrhf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0x61
-#CHECK: vmrhf %v31, %v31, %v31
+# CHECK: vmrhf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0x61
-#CHECK: vmrhg %v0, %v0, %v0
+# CHECK: vmrhg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0x61
-#CHECK: vmrhg %v18, %v3, %v20
+# CHECK: vmrhg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0x61
-#CHECK: vmrhg %v31, %v31, %v31
+# CHECK: vmrhg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0x61
-#CHECK: vmrhh %v0, %v0, %v0
+# CHECK: vmrhh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0x61
-#CHECK: vmrhh %v18, %v3, %v20
+# CHECK: vmrhh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0x61
-#CHECK: vmrhh %v31, %v31, %v31
+# CHECK: vmrhh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0x61
-#CHECK: vmrl %v0, %v0, %v0, 11
+# CHECK: vmrl %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0x60
-#CHECK: vmrl %v18, %v3, %v20, 11
+# CHECK: vmrl %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0x60
-#CHECK: vmrl %v31, %v31, %v31, 11
+# CHECK: vmrl %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0x60
-#CHECK: vmrlb %v0, %v0, %v0
+# CHECK: vmrlb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x60
-#CHECK: vmrlb %v18, %v3, %v20
+# CHECK: vmrlb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x60
-#CHECK: vmrlb %v31, %v31, %v31
+# CHECK: vmrlb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x60
-#CHECK: vmrlf %v0, %v0, %v0
+# CHECK: vmrlf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0x60
-#CHECK: vmrlf %v18, %v3, %v20
+# CHECK: vmrlf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0x60
-#CHECK: vmrlf %v31, %v31, %v31
+# CHECK: vmrlf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0x60
-#CHECK: vmrlg %v0, %v0, %v0
+# CHECK: vmrlg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0x60
-#CHECK: vmrlg %v18, %v3, %v20
+# CHECK: vmrlg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0x60
-#CHECK: vmrlg %v31, %v31, %v31
+# CHECK: vmrlg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0x60
-#CHECK: vmrlh %v0, %v0, %v0
+# CHECK: vmrlh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0x60
-#CHECK: vmrlh %v18, %v3, %v20
+# CHECK: vmrlh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0x60
-#CHECK: vmrlh %v31, %v31, %v31
+# CHECK: vmrlh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0x60
-#CHECK: vmx %v0, %v0, %v0, 11
+# CHECK: vmx %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xff
-#CHECK: vmx %v18, %v3, %v20, 11
+# CHECK: vmx %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xff
-#CHECK: vmx %v31, %v31, %v31, 11
+# CHECK: vmx %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xff
-#CHECK: vmxb %v0, %v0, %v0
+# CHECK: vmxb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xff
-#CHECK: vmxb %v18, %v3, %v20
+# CHECK: vmxb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xff
-#CHECK: vmxb %v31, %v31, %v31
+# CHECK: vmxb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xff
-#CHECK: vmxf %v0, %v0, %v0
+# CHECK: vmxf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xff
-#CHECK: vmxf %v18, %v3, %v20
+# CHECK: vmxf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xff
-#CHECK: vmxf %v31, %v31, %v31
+# CHECK: vmxf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xff
-#CHECK: vmxg %v0, %v0, %v0
+# CHECK: vmxg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xff
-#CHECK: vmxg %v18, %v3, %v20
+# CHECK: vmxg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xff
-#CHECK: vmxg %v31, %v31, %v31
+# CHECK: vmxg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xff
-#CHECK: vmxh %v0, %v0, %v0
+# CHECK: vmxh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xff
-#CHECK: vmxh %v18, %v3, %v20
+# CHECK: vmxh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xff
-#CHECK: vmxh %v31, %v31, %v31
+# CHECK: vmxh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xff
-#CHECK: vmxl %v0, %v0, %v0, 11
+# CHECK: vmxl %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xfd
-#CHECK: vmxl %v18, %v3, %v20, 11
+# CHECK: vmxl %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xfd
-#CHECK: vmxl %v31, %v31, %v31, 11
+# CHECK: vmxl %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xfd
-#CHECK: vmxlb %v0, %v0, %v0
+# CHECK: vmxlb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xfd
-#CHECK: vmxlb %v18, %v3, %v20
+# CHECK: vmxlb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xfd
-#CHECK: vmxlb %v31, %v31, %v31
+# CHECK: vmxlb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xfd
-#CHECK: vmxlf %v0, %v0, %v0
+# CHECK: vmxlf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xfd
-#CHECK: vmxlf %v18, %v3, %v20
+# CHECK: vmxlf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xfd
-#CHECK: vmxlf %v31, %v31, %v31
+# CHECK: vmxlf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xfd
-#CHECK: vmxlg %v0, %v0, %v0
+# CHECK: vmxlg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xfd
-#CHECK: vmxlg %v18, %v3, %v20
+# CHECK: vmxlg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xfd
-#CHECK: vmxlg %v31, %v31, %v31
+# CHECK: vmxlg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xfd
-#CHECK: vmxlh %v0, %v0, %v0
+# CHECK: vmxlh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xfd
-#CHECK: vmxlh %v18, %v3, %v20
+# CHECK: vmxlh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xfd
-#CHECK: vmxlh %v31, %v31, %v31
+# CHECK: vmxlh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xfd
-#CHECK: vn %v0, %v0, %v0
+# CHECK: vn %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x68
-#CHECK: vn %v18, %v3, %v20
+# CHECK: vn %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x68
-#CHECK: vn %v31, %v31, %v31
+# CHECK: vn %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x68
-#CHECK: vnc %v0, %v0, %v0
+# CHECK: vnc %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x69
-#CHECK: vnc %v18, %v3, %v20
+# CHECK: vnc %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x69
-#CHECK: vnc %v31, %v31, %v31
+# CHECK: vnc %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x69
-#CHECK: vno %v0, %v0, %v0
+# CHECK: vno %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x6b
-#CHECK: vno %v18, %v3, %v20
+# CHECK: vno %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x6b
-#CHECK: vno %v31, %v31, %v31
+# CHECK: vno %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x6b
-#CHECK: vo %v0, %v0, %v0
+# CHECK: vo %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x6a
-#CHECK: vo %v18, %v3, %v20
+# CHECK: vo %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x6a
-#CHECK: vo %v31, %v31, %v31
+# CHECK: vo %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x6a
-#CHECK: vpdi %v0, %v0, %v0, 0
+# CHECK: vpdi %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x84
-#CHECK: vpdi %v3, %v20, %v5, 4
+# CHECK: vpdi %v3, %v20, %v5, 4
0xe7 0x34 0x50 0x00 0x44 0x84
-#CHECK: vpdi %v31, %v31, %v31, 15
+# CHECK: vpdi %v31, %v31, %v31, 15
0xe7 0xff 0xf0 0x00 0xfe 0x84
-#CHECK: vperm %v0, %v0, %v0, %v0
+# CHECK: vperm %v0, %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x8c
-#CHECK: vperm %v3, %v20, %v5, %v22
+# CHECK: vperm %v3, %v20, %v5, %v22
0xe7 0x34 0x50 0x00 0x65 0x8c
-#CHECK: vperm %v31, %v31, %v31, %v31
+# CHECK: vperm %v31, %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0xff 0x8c
-#CHECK: vpk %v0, %v0, %v0, 11
+# CHECK: vpk %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0x94
-#CHECK: vpk %v18, %v3, %v20, 11
+# CHECK: vpk %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0x94
-#CHECK: vpk %v31, %v31, %v31, 11
+# CHECK: vpk %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0x94
-#CHECK: vpkf %v0, %v0, %v0
+# CHECK: vpkf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0x94
-#CHECK: vpkf %v18, %v3, %v20
+# CHECK: vpkf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0x94
-#CHECK: vpkf %v31, %v31, %v31
+# CHECK: vpkf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0x94
-#CHECK: vpkg %v0, %v0, %v0
+# CHECK: vpkg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0x94
-#CHECK: vpkg %v18, %v3, %v20
+# CHECK: vpkg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0x94
-#CHECK: vpkg %v31, %v31, %v31
+# CHECK: vpkg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0x94
-#CHECK: vpkh %v0, %v0, %v0
+# CHECK: vpkh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0x94
-#CHECK: vpkh %v18, %v3, %v20
+# CHECK: vpkh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0x94
-#CHECK: vpkh %v31, %v31, %v31
+# CHECK: vpkh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0x94
-#CHECK: vpkls %v0, %v0, %v0, 11, 9
+# CHECK: vpkls %v0, %v0, %v0, 11, 9
0xe7 0x00 0x00 0x90 0xb0 0x95
-#CHECK: vpkls %v18, %v3, %v20, 11, 9
+# CHECK: vpkls %v18, %v3, %v20, 11, 9
0xe7 0x23 0x40 0x90 0xba 0x95
-#CHECK: vpkls %v31, %v31, %v31, 11, 9
+# CHECK: vpkls %v31, %v31, %v31, 11, 9
0xe7 0xff 0xf0 0x90 0xbe 0x95
-#CHECK: vpklsf %v0, %v0, %v0
+# CHECK: vpklsf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0x95
-#CHECK: vpklsf %v18, %v3, %v20
+# CHECK: vpklsf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0x95
-#CHECK: vpklsfs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x24 0x95
-
-#CHECK: vpklsf %v31, %v31, %v31
+# CHECK: vpklsf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0x95
-#CHECK: vpklsg %v0, %v0, %v0
+# CHECK: vpklsfs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x24 0x95
+
+# CHECK: vpklsg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0x95
-#CHECK: vpklsg %v18, %v3, %v20
+# CHECK: vpklsg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0x95
-#CHECK: vpklsgs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x34 0x95
-
-#CHECK: vpklsg %v31, %v31, %v31
+# CHECK: vpklsg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0x95
-#CHECK: vpklsh %v0, %v0, %v0
+# CHECK: vpklsgs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x34 0x95
+
+# CHECK: vpklsh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0x95
-#CHECK: vpklsh %v18, %v3, %v20
+# CHECK: vpklsh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0x95
-#CHECK: vpklshs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x14 0x95
-
-#CHECK: vpklsh %v31, %v31, %v31
+# CHECK: vpklsh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0x95
-#CHECK: vpks %v0, %v0, %v0, 11, 9
+# CHECK: vpklshs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x14 0x95
+
+# CHECK: vpks %v0, %v0, %v0, 11, 9
0xe7 0x00 0x00 0x90 0xb0 0x97
-#CHECK: vpks %v18, %v3, %v20, 11, 9
+# CHECK: vpks %v18, %v3, %v20, 11, 9
0xe7 0x23 0x40 0x90 0xba 0x97
-#CHECK: vpks %v31, %v31, %v31, 11, 9
+# CHECK: vpks %v31, %v31, %v31, 11, 9
0xe7 0xff 0xf0 0x90 0xbe 0x97
-#CHECK: vpksf %v0, %v0, %v0
+# CHECK: vpksf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0x97
-#CHECK: vpksf %v18, %v3, %v20
+# CHECK: vpksf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0x97
-#CHECK: vpksfs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x24 0x97
-
-#CHECK: vpksf %v31, %v31, %v31
+# CHECK: vpksf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0x97
-#CHECK: vpksg %v0, %v0, %v0
+# CHECK: vpksfs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x24 0x97
+
+# CHECK: vpksg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0x97
-#CHECK: vpksg %v18, %v3, %v20
+# CHECK: vpksg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0x97
-#CHECK: vpksgs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x34 0x97
-
-#CHECK: vpksg %v31, %v31, %v31
+# CHECK: vpksg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0x97
-#CHECK: vpksh %v0, %v0, %v0
+# CHECK: vpksgs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x34 0x97
+
+# CHECK: vpksh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0x97
-#CHECK: vpksh %v18, %v3, %v20
+# CHECK: vpksh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0x97
-#CHECK: vpkshs %v7, %v24, %v9
-0xe7 0x78 0x90 0x10 0x14 0x97
-
-#CHECK: vpksh %v31, %v31, %v31
+# CHECK: vpksh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0x97
-#CHECK: vpopct %v0, %v0, 0
+# CHECK: vpkshs %v7, %v24, %v9
+0xe7 0x78 0x90 0x10 0x14 0x97
+
+# CHECK: vpopct %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x50
-#CHECK: vpopct %v19, %v14, 0
+# CHECK: vpopct %v19, %v14, 0
0xe7 0x3e 0x00 0x00 0x08 0x50
-#CHECK: vpopct %v31, %v31
+# CHECK: vpopct %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0x50
-#CHECK: vrep %v0, %v0, 0, 11
+# CHECK: vrep %v0, %v0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x4d
-#CHECK: vrep %v19, %v4, 22136, 11
+# CHECK: vrep %v19, %v4, 22136, 11
0xe7 0x34 0x56 0x78 0xb8 0x4d
-#CHECK: vrep %v31, %v31, 65535, 11
+# CHECK: vrep %v31, %v31, 65535, 11
0xe7 0xff 0xff 0xff 0xbc 0x4d
-#CHECK: vrepb %v0, %v0, 0
+# CHECK: vrepb %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x4d
-#CHECK: vrepb %v19, %v4, 22136
+# CHECK: vrepb %v19, %v4, 22136
0xe7 0x34 0x56 0x78 0x08 0x4d
-#CHECK: vrepb %v31, %v31, 65535
+# CHECK: vrepb %v31, %v31, 65535
0xe7 0xff 0xff 0xff 0x0c 0x4d
-#CHECK: vrepf %v0, %v0, 0
+# CHECK: vrepf %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x4d
-#CHECK: vrepf %v19, %v4, 22136
+# CHECK: vrepf %v19, %v4, 22136
0xe7 0x34 0x56 0x78 0x28 0x4d
-#CHECK: vrepf %v31, %v31, 65535
+# CHECK: vrepf %v31, %v31, 65535
0xe7 0xff 0xff 0xff 0x2c 0x4d
-#CHECK: vrepg %v0, %v0, 0
+# CHECK: vrepg %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x30 0x4d
-#CHECK: vrepg %v19, %v4, 22136
+# CHECK: vrepg %v19, %v4, 22136
0xe7 0x34 0x56 0x78 0x38 0x4d
-#CHECK: vrepg %v31, %v31, 65535
+# CHECK: vrepg %v31, %v31, 65535
0xe7 0xff 0xff 0xff 0x3c 0x4d
-#CHECK: vreph %v0, %v0, 0
+# CHECK: vreph %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x4d
-#CHECK: vreph %v19, %v4, 22136
+# CHECK: vreph %v19, %v4, 22136
0xe7 0x34 0x56 0x78 0x18 0x4d
-#CHECK: vreph %v31, %v31, 65535
+# CHECK: vreph %v31, %v31, 65535
0xe7 0xff 0xff 0xff 0x1c 0x4d
-#CHECK: vrepi %v0, 0, 11
+# CHECK: vrepi %v0, 0, 11
0xe7 0x00 0x00 0x00 0xb0 0x45
-#CHECK: vrepi %v23, -30293, 11
+# CHECK: vrepi %v23, -30293, 11
0xe7 0x70 0x89 0xab 0xb8 0x45
-#CHECK: vrepi %v31, -1, 11
+# CHECK: vrepi %v31, -1, 11
0xe7 0xf0 0xff 0xff 0xb8 0x45
-#CHECK: vrepib %v0, 0
+# CHECK: vrepib %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x45
-#CHECK: vrepib %v23, -30293
+# CHECK: vrepib %v23, -30293
0xe7 0x70 0x89 0xab 0x08 0x45
-#CHECK: vrepib %v31, -1
+# CHECK: vrepib %v31, -1
0xe7 0xf0 0xff 0xff 0x08 0x45
-#CHECK: vrepif %v0, 0
+# CHECK: vrepif %v0, 0
0xe7 0x00 0x00 0x00 0x20 0x45
-#CHECK: vrepif %v23, -30293
+# CHECK: vrepif %v23, -30293
0xe7 0x70 0x89 0xab 0x28 0x45
-#CHECK: vrepif %v31, -1
+# CHECK: vrepif %v31, -1
0xe7 0xf0 0xff 0xff 0x28 0x45
-#CHECK: vrepig %v0, 0
+# CHECK: vrepig %v0, 0
0xe7 0x00 0x00 0x00 0x30 0x45
-#CHECK: vrepig %v23, -30293
+# CHECK: vrepig %v23, -30293
0xe7 0x70 0x89 0xab 0x38 0x45
-#CHECK: vrepig %v31, -1
+# CHECK: vrepig %v31, -1
0xe7 0xf0 0xff 0xff 0x38 0x45
-#CHECK: vrepih %v0, 0
+# CHECK: vrepih %v0, 0
0xe7 0x00 0x00 0x00 0x10 0x45
-#CHECK: vrepih %v23, -30293
+# CHECK: vrepih %v23, -30293
0xe7 0x70 0x89 0xab 0x18 0x45
-#CHECK: vrepih %v31, -1
+# CHECK: vrepih %v31, -1
0xe7 0xf0 0xff 0xff 0x18 0x45
-#CHECK: vs %v0, %v0, %v0, 11
+# CHECK: vs %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xf7
-#CHECK: vs %v18, %v3, %v20, 11
+# CHECK: vs %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xf7
-#CHECK: vs %v31, %v31, %v31, 11
+# CHECK: vs %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xf7
-#CHECK: vsb %v0, %v0, %v0
+# CHECK: vsb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xf7
-#CHECK: vsb %v18, %v3, %v20
+# CHECK: vsb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xf7
-#CHECK: vsb %v31, %v31, %v31
+# CHECK: vsb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xf7
-#CHECK: vsbi %v0, %v0, %v0, %v0, 11
-0xe7 0x00 0x0b 0x00 0x00 0xbf
-
-#CHECK: vsbi %v3, %v20, %v5, %v22, 11
-0xe7 0x34 0x5b 0x00 0x65 0xbf
-
-#CHECK: vsbi %v31, %v31, %v31, %v31, 11
-0xe7 0xff 0xfb 0x00 0xff 0xbf
-
-#CHECK: vsbiq %v0, %v0, %v0, %v0
-0xe7 0x00 0x04 0x00 0x00 0xbf
-
-#CHECK: vsbiq %v3, %v20, %v5, %v22
-0xe7 0x34 0x54 0x00 0x65 0xbf
-
-#CHECK: vsbiq %v31, %v31, %v31, %v31
-0xe7 0xff 0xf4 0x00 0xff 0xbf
-
-#CHECK: vsbcbi %v0, %v0, %v0, %v0, 11
+# CHECK: vsbcbi %v0, %v0, %v0, %v0, 11
0xe7 0x00 0x0b 0x00 0x00 0xbd
-#CHECK: vsbcbi %v3, %v20, %v5, %v22, 11
+# CHECK: vsbcbi %v3, %v20, %v5, %v22, 11
0xe7 0x34 0x5b 0x00 0x65 0xbd
-#CHECK: vsbcbi %v31, %v31, %v31, %v31, 11
+# CHECK: vsbcbi %v31, %v31, %v31, %v31, 11
0xe7 0xff 0xfb 0x00 0xff 0xbd
-#CHECK: vsbcbiq %v0, %v0, %v0, %v0
+# CHECK: vsbcbiq %v0, %v0, %v0, %v0
0xe7 0x00 0x04 0x00 0x00 0xbd
-#CHECK: vsbcbiq %v3, %v20, %v5, %v22
+# CHECK: vsbcbiq %v3, %v20, %v5, %v22
0xe7 0x34 0x54 0x00 0x65 0xbd
-#CHECK: vsbcbiq %v31, %v31, %v31, %v31
+# CHECK: vsbcbiq %v31, %v31, %v31, %v31
0xe7 0xff 0xf4 0x00 0xff 0xbd
-#CHECK: vscbi %v0, %v0, %v0, 11
+# CHECK: vsbi %v0, %v0, %v0, %v0, 11
+0xe7 0x00 0x0b 0x00 0x00 0xbf
+
+# CHECK: vsbi %v3, %v20, %v5, %v22, 11
+0xe7 0x34 0x5b 0x00 0x65 0xbf
+
+# CHECK: vsbi %v31, %v31, %v31, %v31, 11
+0xe7 0xff 0xfb 0x00 0xff 0xbf
+
+# CHECK: vsbiq %v0, %v0, %v0, %v0
+0xe7 0x00 0x04 0x00 0x00 0xbf
+
+# CHECK: vsbiq %v3, %v20, %v5, %v22
+0xe7 0x34 0x54 0x00 0x65 0xbf
+
+# CHECK: vsbiq %v31, %v31, %v31, %v31
+0xe7 0xff 0xf4 0x00 0xff 0xbf
+
+# CHECK: vscbi %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xf5
-#CHECK: vscbi %v18, %v3, %v20, 11
+# CHECK: vscbi %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0xf5
-#CHECK: vscbi %v31, %v31, %v31, 11
+# CHECK: vscbi %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0xf5
-#CHECK: vscbib %v0, %v0, %v0
+# CHECK: vscbib %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xf5
-#CHECK: vscbib %v18, %v3, %v20
+# CHECK: vscbib %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0xf5
-#CHECK: vscbib %v31, %v31, %v31
+# CHECK: vscbib %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0xf5
-#CHECK: vscbif %v0, %v0, %v0
+# CHECK: vscbif %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xf5
-#CHECK: vscbif %v18, %v3, %v20
+# CHECK: vscbif %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xf5
-#CHECK: vscbif %v31, %v31, %v31
+# CHECK: vscbif %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xf5
-#CHECK: vscbig %v0, %v0, %v0
+# CHECK: vscbig %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xf5
-#CHECK: vscbig %v18, %v3, %v20
+# CHECK: vscbig %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xf5
-#CHECK: vscbig %v31, %v31, %v31
+# CHECK: vscbig %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xf5
-#CHECK: vscbih %v0, %v0, %v0
+# CHECK: vscbih %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xf5
-#CHECK: vscbih %v18, %v3, %v20
+# CHECK: vscbih %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xf5
-#CHECK: vscbih %v31, %v31, %v31
+# CHECK: vscbih %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xf5
-#CHECK: vscbiq %v0, %v0, %v0
+# CHECK: vscbiq %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x40 0xf5
-#CHECK: vscbiq %v18, %v3, %v20
+# CHECK: vscbiq %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x4a 0xf5
-#CHECK: vscbiq %v31, %v31, %v31
+# CHECK: vscbiq %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x4e 0xf5
-#CHECK: vscef %v0, 0(%v0), 0
+# CHECK: vscef %v0, 0(%v0), 0
0xe7 0x00 0x00 0x00 0x00 0x1b
-#CHECK: vscef %v10, 1000(%v19,%r7), 2
+# CHECK: vscef %v10, 1000(%v19,%r7), 2
0xe7 0xa3 0x73 0xe8 0x24 0x1b
-#CHECK: vscef %v31, 4095(%v31,%r15), 3
+# CHECK: vscef %v31, 4095(%v31,%r15), 3
0xe7 0xff 0xff 0xff 0x3c 0x1b
-#CHECK: vsceg %v0, 0(%v0), 0
+# CHECK: vsceg %v0, 0(%v0), 0
0xe7 0x00 0x00 0x00 0x00 0x1a
-#CHECK: vsceg %v10, 1000(%v19,%r7), 1
+# CHECK: vsceg %v10, 1000(%v19,%r7), 1
0xe7 0xa3 0x73 0xe8 0x14 0x1a
-#CHECK: vsceg %v31, 4095(%v31,%r15), 1
+# CHECK: vsceg %v31, 4095(%v31,%r15), 1
0xe7 0xff 0xff 0xff 0x1c 0x1a
-#CHECK: vseg %v0, %v0, 11
+# CHECK: vseg %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0x5f
-#CHECK: vseg %v19, %v14, 11
+# CHECK: vseg %v19, %v14, 11
0xe7 0x3e 0x00 0x00 0xb8 0x5f
-#CHECK: vseg %v31, %v31, 11
+# CHECK: vseg %v31, %v31, 11
0xe7 0xff 0x00 0x00 0xbc 0x5f
-#CHECK: vsegb %v0, %v0
+# CHECK: vsegb %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x5f
-#CHECK: vsegb %v19, %v14
+# CHECK: vsegb %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0x5f
-#CHECK: vsegb %v31, %v31
+# CHECK: vsegb %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0x5f
-#CHECK: vsegf %v0, %v0
+# CHECK: vsegf %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0x5f
-#CHECK: vsegf %v19, %v14
+# CHECK: vsegf %v19, %v14
0xe7 0x3e 0x00 0x00 0x28 0x5f
-#CHECK: vsegf %v31, %v31
+# CHECK: vsegf %v31, %v31
0xe7 0xff 0x00 0x00 0x2c 0x5f
-#CHECK: vsegh %v0, %v0
+# CHECK: vsegh %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0x5f
-#CHECK: vsegh %v19, %v14
+# CHECK: vsegh %v19, %v14
0xe7 0x3e 0x00 0x00 0x18 0x5f
-#CHECK: vsegh %v31, %v31
+# CHECK: vsegh %v31, %v31
0xe7 0xff 0x00 0x00 0x1c 0x5f
-#CHECK: vsel %v0, %v0, %v0, %v0
+# CHECK: vsel %v0, %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x8d
-#CHECK: vsel %v3, %v20, %v5, %v22
+# CHECK: vsel %v3, %v20, %v5, %v22
0xe7 0x34 0x50 0x00 0x65 0x8d
-#CHECK: vsel %v31, %v31, %v31, %v31
+# CHECK: vsel %v31, %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0xff 0x8d
-#CHECK: vsf %v0, %v0, %v0
+# CHECK: vsf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xf7
-#CHECK: vsf %v18, %v3, %v20
+# CHECK: vsf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0xf7
-#CHECK: vsf %v31, %v31, %v31
+# CHECK: vsf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0xf7
-#CHECK: vsg %v0, %v0, %v0
+# CHECK: vsg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0xf7
-#CHECK: vsg %v18, %v3, %v20
+# CHECK: vsg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0xf7
-#CHECK: vsg %v31, %v31, %v31
+# CHECK: vsg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0xf7
-#CHECK: vsh %v0, %v0, %v0
+# CHECK: vsh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xf7
-#CHECK: vsh %v18, %v3, %v20
+# CHECK: vsh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0xf7
-#CHECK: vsh %v31, %v31, %v31
+# CHECK: vsh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0xf7
-#CHECK: vsl %v0, %v0, %v0
+# CHECK: vsl %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x74
-#CHECK: vsl %v18, %v3, %v20
+# CHECK: vsl %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x74
-#CHECK: vsl %v31, %v31, %v31
+# CHECK: vsl %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x74
-#CHECK: vslb %v0, %v0, %v0
+# CHECK: vslb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x75
-#CHECK: vslb %v18, %v3, %v20
+# CHECK: vslb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x75
-#CHECK: vslb %v31, %v31, %v31
+# CHECK: vslb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x75
-#CHECK: vsldb %v0, %v0, %v0, 0
+# CHECK: vsldb %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x77
-#CHECK: vsldb %v3, %v20, %v5, 103
+# CHECK: vsldb %v3, %v20, %v5, 103
0xe7 0x34 0x50 0x67 0x04 0x77
-#CHECK: vsldb %v31, %v31, %v31, 255
+# CHECK: vsldb %v31, %v31, %v31, 255
0xe7 0xff 0xf0 0xff 0x0e 0x77
-#CHECK: vsq %v0, %v0, %v0
+# CHECK: vsq %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x40 0xf7
-#CHECK: vsq %v18, %v3, %v20
+# CHECK: vsq %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x4a 0xf7
-#CHECK: vsq %v31, %v31, %v31
+# CHECK: vsq %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x4e 0xf7
-#CHECK: vsra %v0, %v0, %v0
+# CHECK: vsra %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x7e
-#CHECK: vsra %v18, %v3, %v20
+# CHECK: vsra %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x7e
-#CHECK: vsra %v31, %v31, %v31
+# CHECK: vsra %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x7e
-#CHECK: vsrab %v0, %v0, %v0
+# CHECK: vsrab %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x7f
-#CHECK: vsrab %v18, %v3, %v20
+# CHECK: vsrab %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x7f
-#CHECK: vsrab %v31, %v31, %v31
+# CHECK: vsrab %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x7f
-#CHECK: vsrl %v0, %v0, %v0
+# CHECK: vsrl %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x7c
-#CHECK: vsrl %v18, %v3, %v20
+# CHECK: vsrl %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x7c
-#CHECK: vsrl %v31, %v31, %v31
+# CHECK: vsrl %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x7c
-#CHECK: vsrlb %v0, %v0, %v0
+# CHECK: vsrlb %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x7d
-#CHECK: vsrlb %v18, %v3, %v20
+# CHECK: vsrlb %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x7d
-#CHECK: vsrlb %v31, %v31, %v31
+# CHECK: vsrlb %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x7d
-#CHECK: vst %v0, 0
+# CHECK: vst %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x0E
-#CHECK: vst %v17, 2475(%r7,%r8)
+# CHECK: vst %v17, 2475(%r7,%r8)
0xe7 0x17 0x89 0xab 0x08 0x0E
-#CHECK: vst %v31, 4095(%r15,%r15)
+# CHECK: vst %v31, 4095(%r15,%r15)
0xe7 0xff 0xff 0xff 0x08 0x0E
-#CHECK: vsteb %v0, 0, 0
+# CHECK: vsteb %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x08
-#CHECK: vsteb %v17, 2475(%r7,%r8), 12
+# CHECK: vsteb %v17, 2475(%r7,%r8), 12
0xe7 0x17 0x89 0xab 0xc8 0x08
-#CHECK: vsteb %v31, 4095(%r15,%r15), 15
+# CHECK: vsteb %v31, 4095(%r15,%r15), 15
0xe7 0xff 0xff 0xff 0xf8 0x08
-#CHECK: vstef %v0, 0, 0
+# CHECK: vstef %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x0b
-#CHECK: vstef %v17, 2475(%r7,%r8), 2
+# CHECK: vstef %v17, 2475(%r7,%r8), 2
0xe7 0x17 0x89 0xab 0x28 0x0b
-#CHECK: vstef %v31, 4095(%r15,%r15), 3
+# CHECK: vstef %v31, 4095(%r15,%r15), 3
0xe7 0xff 0xff 0xff 0x38 0x0b
-#CHECK: vsteg %v0, 0, 0
+# CHECK: vsteg %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x0a
-#CHECK: vsteg %v17, 2475(%r7,%r8), 1
+# CHECK: vsteg %v17, 2475(%r7,%r8), 1
0xe7 0x17 0x89 0xab 0x18 0x0a
-#CHECK: vsteg %v31, 4095(%r15,%r15), 1
+# CHECK: vsteg %v31, 4095(%r15,%r15), 1
0xe7 0xff 0xff 0xff 0x18 0x0a
-#CHECK: vsteh %v0, 0, 0
+# CHECK: vsteh %v0, 0, 0
0xe7 0x00 0x00 0x00 0x00 0x09
-#CHECK: vsteh %v17, 2475(%r7,%r8), 5
+# CHECK: vsteh %v17, 2475(%r7,%r8), 5
0xe7 0x17 0x89 0xab 0x58 0x09
-#CHECK: vsteh %v31, 4095(%r15,%r15), 7
+# CHECK: vsteh %v31, 4095(%r15,%r15), 7
0xe7 0xff 0xff 0xff 0x78 0x09
-#CHECK: vstl %v0, %r0, 0
+# CHECK: vstl %v0, %r0, 0
0xe7 0x00 0x00 0x00 0x00 0x3f
-#CHECK: vstl %v18, %r3, 1383(%r4)
+# CHECK: vstl %v18, %r3, 1383(%r4)
0xe7 0x23 0x45 0x67 0x08 0x3f
-#CHECK: vstl %v31, %r15, 4095(%r15)
+# CHECK: vstl %v31, %r15, 4095(%r15)
0xe7 0xff 0xff 0xff 0x08 0x3f
-#CHECK: vstm %v0, %v0, 0
+# CHECK: vstm %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x3e
-#CHECK: vstm %v12, %v18, 1110(%r3)
+# CHECK: vstm %v12, %v18, 1110(%r3)
0xe7 0xc2 0x34 0x56 0x04 0x3e
-#CHECK: vstm %v31, %v31, 4095(%r15)
+# CHECK: vstm %v31, %v31, 4095(%r15)
0xe7 0xff 0xff 0xff 0x0c 0x3e
-#CHECK: vstrc %v0, %v0, %v0, %v0, 11, 0
+# CHECK: vstrc %v0, %v0, %v0, %v0, 11, 0
0xe7 0x00 0x0b 0x00 0x00 0x8a
-#CHECK: vstrc %v0, %v0, %v0, %v0, 11, 12
+# CHECK: vstrc %v0, %v0, %v0, %v0, 11, 12
0xe7 0x00 0x0b 0xc0 0x00 0x8a
-#CHECK: vstrc %v18, %v3, %v20, %v5, 11, 0
+# CHECK: vstrc %v18, %v3, %v20, %v5, 11, 0
0xe7 0x23 0x4b 0x00 0x5a 0x8a
-#CHECK: vstrc %v31, %v31, %v31, %v31, 11, 4
+# CHECK: vstrc %v31, %v31, %v31, %v31, 11, 4
0xe7 0xff 0xfb 0x40 0xff 0x8a
-#CHECK: vstrcb %v0, %v0, %v0, %v0, 0
+# CHECK: vstrcb %v0, %v0, %v0, %v0, 0
0xe7 0x00 0x00 0x00 0x00 0x8a
-#CHECK: vstrcb %v0, %v0, %v0, %v0, 12
+# CHECK: vstrcb %v0, %v0, %v0, %v0, 12
0xe7 0x00 0x00 0xc0 0x00 0x8a
-#CHECK: vstrcb %v18, %v3, %v20, %v5, 0
+# CHECK: vstrcb %v18, %v3, %v20, %v5, 0
0xe7 0x23 0x40 0x00 0x5a 0x8a
-#CHECK: vstrcb %v31, %v31, %v31, %v31, 4
+# CHECK: vstrcb %v31, %v31, %v31, %v31, 4
0xe7 0xff 0xf0 0x40 0xff 0x8a
-#CHECK: vstrcbs %v31, %v31, %v31, %v31, 8
+# CHECK: vstrcbs %v31, %v31, %v31, %v31, 8
0xe7 0xff 0xf0 0x90 0xff 0x8a
-#CHECK: vstrczb %v31, %v31, %v31, %v31, 4
+# CHECK: vstrczb %v31, %v31, %v31, %v31, 4
0xe7 0xff 0xf0 0x60 0xff 0x8a
-#CHECK: vstrczbs %v31, %v31, %v31, %v31, 8
+# CHECK: vstrczbs %v31, %v31, %v31, %v31, 8
0xe7 0xff 0xf0 0xb0 0xff 0x8a
-#CHECK: vstrcf %v0, %v0, %v0, %v0, 0
+# CHECK: vstrcf %v0, %v0, %v0, %v0, 0
0xe7 0x00 0x02 0x00 0x00 0x8a
-#CHECK: vstrcf %v0, %v0, %v0, %v0, 12
+# CHECK: vstrcf %v0, %v0, %v0, %v0, 12
0xe7 0x00 0x02 0xc0 0x00 0x8a
-#CHECK: vstrcf %v18, %v3, %v20, %v5, 0
+# CHECK: vstrcf %v18, %v3, %v20, %v5, 0
0xe7 0x23 0x42 0x00 0x5a 0x8a
-#CHECK: vstrcf %v31, %v31, %v31, %v31, 4
+# CHECK: vstrcf %v31, %v31, %v31, %v31, 4
0xe7 0xff 0xf2 0x40 0xff 0x8a
-#CHECK: vstrcfs %v31, %v31, %v31, %v31, 8
+# CHECK: vstrcfs %v31, %v31, %v31, %v31, 8
0xe7 0xff 0xf2 0x90 0xff 0x8a
-#CHECK: vstrczf %v31, %v31, %v31, %v31, 4
+# CHECK: vstrczf %v31, %v31, %v31, %v31, 4
0xe7 0xff 0xf2 0x60 0xff 0x8a
-#CHECK: vstrczfs %v31, %v31, %v31, %v31, 8
+# CHECK: vstrczfs %v31, %v31, %v31, %v31, 8
0xe7 0xff 0xf2 0xb0 0xff 0x8a
-#CHECK: vstrch %v0, %v0, %v0, %v0, 0
+# CHECK: vstrch %v0, %v0, %v0, %v0, 0
0xe7 0x00 0x01 0x00 0x00 0x8a
-#CHECK: vstrch %v0, %v0, %v0, %v0, 12
+# CHECK: vstrch %v0, %v0, %v0, %v0, 12
0xe7 0x00 0x01 0xc0 0x00 0x8a
-#CHECK: vstrch %v18, %v3, %v20, %v5, 0
+# CHECK: vstrch %v18, %v3, %v20, %v5, 0
0xe7 0x23 0x41 0x00 0x5a 0x8a
-#CHECK: vstrch %v31, %v31, %v31, %v31, 4
+# CHECK: vstrch %v31, %v31, %v31, %v31, 4
0xe7 0xff 0xf1 0x40 0xff 0x8a
-#CHECK: vstrchs %v31, %v31, %v31, %v31, 8
+# CHECK: vstrchs %v31, %v31, %v31, %v31, 8
0xe7 0xff 0xf1 0x90 0xff 0x8a
-#CHECK: vstrczh %v31, %v31, %v31, %v31, 4
+# CHECK: vstrczh %v31, %v31, %v31, %v31, 4
0xe7 0xff 0xf1 0x60 0xff 0x8a
-#CHECK: vstrczhs %v31, %v31, %v31, %v31, 8
+# CHECK: vstrczhs %v31, %v31, %v31, %v31, 8
0xe7 0xff 0xf1 0xb0 0xff 0x8a
-#CHECK: vsumg %v0, %v0, %v0, 11
+# CHECK: vsum %v0, %v0, %v0, 11
+0xe7 0x00 0x00 0x00 0xb0 0x64
+
+# CHECK: vsum %v18, %v3, %v20, 11
+0xe7 0x23 0x40 0x00 0xba 0x64
+
+# CHECK: vsum %v31, %v31, %v31, 11
+0xe7 0xff 0xf0 0x00 0xbe 0x64
+
+# CHECK: vsumb %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x00 0x64
+
+# CHECK: vsumb %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x0a 0x64
+
+# CHECK: vsumb %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x0e 0x64
+
+# CHECK: vsumg %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0x65
-#CHECK: vsumg %v18, %v3, %v20, 11
+# CHECK: vsumg %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0x65
-#CHECK: vsumg %v31, %v31, %v31, 11
+# CHECK: vsumg %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0x65
-#CHECK: vsumgh %v0, %v0, %v0
+# CHECK: vsumgf %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x20 0x65
+
+# CHECK: vsumgf %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x2a 0x65
+
+# CHECK: vsumgf %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x2e 0x65
+
+# CHECK: vsumgh %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0x65
-#CHECK: vsumgh %v18, %v3, %v20
+# CHECK: vsumgh %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x1a 0x65
-#CHECK: vsumgh %v31, %v31, %v31
+# CHECK: vsumgh %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x1e 0x65
-#CHECK: vsumgf %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x20 0x65
+# CHECK: vsumh %v0, %v0, %v0
+0xe7 0x00 0x00 0x00 0x10 0x64
-#CHECK: vsumgf %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x2a 0x65
+# CHECK: vsumh %v18, %v3, %v20
+0xe7 0x23 0x40 0x00 0x1a 0x64
-#CHECK: vsumgf %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x2e 0x65
+# CHECK: vsumh %v31, %v31, %v31
+0xe7 0xff 0xf0 0x00 0x1e 0x64
-#CHECK: vsumq %v0, %v0, %v0, 11
+# CHECK: vsumq %v0, %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0x67
-#CHECK: vsumq %v18, %v3, %v20, 11
+# CHECK: vsumq %v18, %v3, %v20, 11
0xe7 0x23 0x40 0x00 0xba 0x67
-#CHECK: vsumq %v31, %v31, %v31, 11
+# CHECK: vsumq %v31, %v31, %v31, 11
0xe7 0xff 0xf0 0x00 0xbe 0x67
-#CHECK: vsumqf %v0, %v0, %v0
+# CHECK: vsumqf %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0x67
-#CHECK: vsumqf %v18, %v3, %v20
+# CHECK: vsumqf %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x2a 0x67
-#CHECK: vsumqf %v31, %v31, %v31
+# CHECK: vsumqf %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x2e 0x67
-#CHECK: vsumqg %v0, %v0, %v0
+# CHECK: vsumqg %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x30 0x67
-#CHECK: vsumqg %v18, %v3, %v20
+# CHECK: vsumqg %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x3a 0x67
-#CHECK: vsumqg %v31, %v31, %v31
+# CHECK: vsumqg %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x3e 0x67
-#CHECK: vsum %v0, %v0, %v0, 11
-0xe7 0x00 0x00 0x00 0xb0 0x64
-
-#CHECK: vsum %v18, %v3, %v20, 11
-0xe7 0x23 0x40 0x00 0xba 0x64
-
-#CHECK: vsum %v31, %v31, %v31, 11
-0xe7 0xff 0xf0 0x00 0xbe 0x64
-
-#CHECK: vsumb %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x00 0x64
-
-#CHECK: vsumb %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x0a 0x64
-
-#CHECK: vsumb %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x0e 0x64
-
-#CHECK: vsumh %v0, %v0, %v0
-0xe7 0x00 0x00 0x00 0x10 0x64
-
-#CHECK: vsumh %v18, %v3, %v20
-0xe7 0x23 0x40 0x00 0x1a 0x64
-
-#CHECK: vsumh %v31, %v31, %v31
-0xe7 0xff 0xf0 0x00 0x1e 0x64
-
-#CHECK: vtm %v0, %v0
+# CHECK: vtm %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xd8
-#CHECK: vtm %v19, %v14
+# CHECK: vtm %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0xd8
-#CHECK: vtm %v31, %v31
+# CHECK: vtm %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0xd8
-#CHECK: vuph %v0, %v0, 11
+# CHECK: vuph %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xd7
-#CHECK: vuph %v19, %v14, 11
+# CHECK: vuph %v19, %v14, 11
0xe7 0x3e 0x00 0x00 0xb8 0xd7
-#CHECK: vuph %v31, %v31, 11
+# CHECK: vuph %v31, %v31, 11
0xe7 0xff 0x00 0x00 0xbc 0xd7
-#CHECK: vuphb %v0, %v0
+# CHECK: vuphb %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xd7
-#CHECK: vuphb %v19, %v14
+# CHECK: vuphb %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0xd7
-#CHECK: vuphb %v31, %v31
+# CHECK: vuphb %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0xd7
-#CHECK: vuphf %v0, %v0
+# CHECK: vuphf %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xd7
-#CHECK: vuphf %v19, %v14
+# CHECK: vuphf %v19, %v14
0xe7 0x3e 0x00 0x00 0x28 0xd7
-#CHECK: vuphf %v31, %v31
+# CHECK: vuphf %v31, %v31
0xe7 0xff 0x00 0x00 0x2c 0xd7
-#CHECK: vuphh %v0, %v0
+# CHECK: vuphh %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xd7
-#CHECK: vuphh %v19, %v14
+# CHECK: vuphh %v19, %v14
0xe7 0x3e 0x00 0x00 0x18 0xd7
-#CHECK: vuphh %v31, %v31
+# CHECK: vuphh %v31, %v31
0xe7 0xff 0x00 0x00 0x1c 0xd7
-#CHECK: vuplh %v0, %v0, 11
+# CHECK: vupl %v0, %v0, 11
+0xe7 0x00 0x00 0x00 0xb0 0xd6
+
+# CHECK: vupl %v19, %v14, 11
+0xe7 0x3e 0x00 0x00 0xb8 0xd6
+
+# CHECK: vupl %v31, %v31, 11
+0xe7 0xff 0x00 0x00 0xbc 0xd6
+
+# CHECK: vuplb %v0, %v0
+0xe7 0x00 0x00 0x00 0x00 0xd6
+
+# CHECK: vuplb %v19, %v14
+0xe7 0x3e 0x00 0x00 0x08 0xd6
+
+# CHECK: vuplb %v31, %v31
+0xe7 0xff 0x00 0x00 0x0c 0xd6
+
+# CHECK: vuplf %v0, %v0
+0xe7 0x00 0x00 0x00 0x20 0xd6
+
+# CHECK: vuplf %v19, %v14
+0xe7 0x3e 0x00 0x00 0x28 0xd6
+
+# CHECK: vuplf %v31, %v31
+0xe7 0xff 0x00 0x00 0x2c 0xd6
+
+# CHECK: vuplh %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xd5
-#CHECK: vuplh %v19, %v14, 11
+# CHECK: vuplh %v19, %v14, 11
0xe7 0x3e 0x00 0x00 0xb8 0xd5
-#CHECK: vuplh %v31, %v31, 11
+# CHECK: vuplh %v31, %v31, 11
0xe7 0xff 0x00 0x00 0xbc 0xd5
-#CHECK: vuplhb %v0, %v0
+# CHECK: vuplhb %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xd5
-#CHECK: vuplhb %v19, %v14
+# CHECK: vuplhb %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0xd5
-#CHECK: vuplhb %v31, %v31
+# CHECK: vuplhb %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0xd5
-#CHECK: vuplhf %v0, %v0
+# CHECK: vuplhf %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xd5
-#CHECK: vuplhf %v19, %v14
+# CHECK: vuplhf %v19, %v14
0xe7 0x3e 0x00 0x00 0x28 0xd5
-#CHECK: vuplhf %v31, %v31
+# CHECK: vuplhf %v31, %v31
0xe7 0xff 0x00 0x00 0x2c 0xd5
-#CHECK: vuplhh %v0, %v0
+# CHECK: vuplhh %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xd5
-#CHECK: vuplhh %v19, %v14
+# CHECK: vuplhh %v19, %v14
0xe7 0x3e 0x00 0x00 0x18 0xd5
-#CHECK: vuplhh %v31, %v31
+# CHECK: vuplhh %v31, %v31
0xe7 0xff 0x00 0x00 0x1c 0xd5
-#CHECK: vupl %v0, %v0, 11
-0xe7 0x00 0x00 0x00 0xb0 0xd6
-
-#CHECK: vupl %v19, %v14, 11
-0xe7 0x3e 0x00 0x00 0xb8 0xd6
-
-#CHECK: vupl %v31, %v31, 11
-0xe7 0xff 0x00 0x00 0xbc 0xd6
-
-#CHECK: vuplb %v0, %v0
-0xe7 0x00 0x00 0x00 0x00 0xd6
-
-#CHECK: vuplb %v19, %v14
-0xe7 0x3e 0x00 0x00 0x08 0xd6
-
-#CHECK: vuplb %v31, %v31
-0xe7 0xff 0x00 0x00 0x0c 0xd6
-
-#CHECK: vuplf %v0, %v0
-0xe7 0x00 0x00 0x00 0x20 0xd6
-
-#CHECK: vuplf %v19, %v14
-0xe7 0x3e 0x00 0x00 0x28 0xd6
-
-#CHECK: vuplf %v31, %v31
-0xe7 0xff 0x00 0x00 0x2c 0xd6
-
-#CHECK: vuplhw %v0, %v0
+# CHECK: vuplhw %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xd6
-#CHECK: vuplhw %v19, %v14
+# CHECK: vuplhw %v19, %v14
0xe7 0x3e 0x00 0x00 0x18 0xd6
-#CHECK: vuplhw %v31, %v31
+# CHECK: vuplhw %v31, %v31
0xe7 0xff 0x00 0x00 0x1c 0xd6
-#CHECK: vupll %v0, %v0, 11
+# CHECK: vupll %v0, %v0, 11
0xe7 0x00 0x00 0x00 0xb0 0xd4
-#CHECK: vupll %v19, %v14, 11
+# CHECK: vupll %v19, %v14, 11
0xe7 0x3e 0x00 0x00 0xb8 0xd4
-#CHECK: vupll %v31, %v31, 11
+# CHECK: vupll %v31, %v31, 11
0xe7 0xff 0x00 0x00 0xbc 0xd4
-#CHECK: vupllb %v0, %v0
+# CHECK: vupllb %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0xd4
-#CHECK: vupllb %v19, %v14
+# CHECK: vupllb %v19, %v14
0xe7 0x3e 0x00 0x00 0x08 0xd4
-#CHECK: vupllb %v31, %v31
+# CHECK: vupllb %v31, %v31
0xe7 0xff 0x00 0x00 0x0c 0xd4
-#CHECK: vupllf %v0, %v0
+# CHECK: vupllf %v0, %v0
0xe7 0x00 0x00 0x00 0x20 0xd4
-#CHECK: vupllf %v19, %v14
+# CHECK: vupllf %v19, %v14
0xe7 0x3e 0x00 0x00 0x28 0xd4
-#CHECK: vupllf %v31, %v31
+# CHECK: vupllf %v31, %v31
0xe7 0xff 0x00 0x00 0x2c 0xd4
-#CHECK: vupllh %v0, %v0
+# CHECK: vupllh %v0, %v0
0xe7 0x00 0x00 0x00 0x10 0xd4
-#CHECK: vupllh %v19, %v14
+# CHECK: vupllh %v19, %v14
0xe7 0x3e 0x00 0x00 0x18 0xd4
-#CHECK: vupllh %v31, %v31
+# CHECK: vupllh %v31, %v31
0xe7 0xff 0x00 0x00 0x1c 0xd4
-#CHECK: vx %v0, %v0, %v0
+# CHECK: vx %v0, %v0, %v0
0xe7 0x00 0x00 0x00 0x00 0x6d
-#CHECK: vx %v18, %v3, %v20
+# CHECK: vx %v18, %v3, %v20
0xe7 0x23 0x40 0x00 0x0a 0x6d
-#CHECK: vx %v31, %v31, %v31
+# CHECK: vx %v31, %v31, %v31
0xe7 0xff 0xf0 0x00 0x0e 0x6d
-#CHECK: wcdgb %f0, %f0, 0, 0
+# CHECK: wcdgb %f0, %f0, 0, 0
0xe7 0x00 0x00 0x08 0x30 0xc3
-#CHECK: wcdgb %v19, %f14, 4, 10
+# CHECK: wcdgb %v19, %f14, 4, 10
0xe7 0x3e 0x00 0xac 0x38 0xc3
-#CHECK: wcdgb %v31, %v31, 7, 15
+# CHECK: wcdgb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xff 0x3c 0xc3
-#CHECK: wcdlgb %f0, %f0, 0, 0
+# CHECK: wcdlgb %f0, %f0, 0, 0
0xe7 0x00 0x00 0x08 0x30 0xc1
-#CHECK: wcdlgb %v19, %f14, 4, 10
+# CHECK: wcdlgb %v19, %f14, 4, 10
0xe7 0x3e 0x00 0xac 0x38 0xc1
-#CHECK: wcdlgb %v31, %v31, 7, 15
+# CHECK: wcdlgb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xff 0x3c 0xc1
-#CHECK: wcgdb %f0, %f0, 0, 0
+# CHECK: wcgdb %f0, %f0, 0, 0
0xe7 0x00 0x00 0x08 0x30 0xc2
-#CHECK: wcgdb %v19, %f14, 4, 10
+# CHECK: wcgdb %v19, %f14, 4, 10
0xe7 0x3e 0x00 0xac 0x38 0xc2
-#CHECK: wcgdb %v31, %v31, 7, 15
+# CHECK: wcgdb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xff 0x3c 0xc2
-#CHECK: wclgdb %f0, %f0, 0, 0
+# CHECK: wclgdb %f0, %f0, 0, 0
0xe7 0x00 0x00 0x08 0x30 0xc0
-#CHECK: wclgdb %v19, %f14, 4, 10
+# CHECK: wclgdb %v19, %f14, 4, 10
0xe7 0x3e 0x00 0xac 0x38 0xc0
-#CHECK: wclgdb %v31, %v31, 7, 15
+# CHECK: wclgdb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xff 0x3c 0xc0
-#CHECK: wfadb %f0, %f0, %f0
+# CHECK: wfadb %f0, %f0, %f0
0xe7 0x00 0x00 0x08 0x30 0xe3
-#CHECK: wfadb %v18, %f3, %v20
+# CHECK: wfadb %v18, %f3, %v20
0xe7 0x23 0x40 0x08 0x3a 0xe3
-#CHECK: wfadb %v31, %v31, %v31
+# CHECK: wfadb %v31, %v31, %v31
0xe7 0xff 0xf0 0x08 0x3e 0xe3
-#CHECK: wfc %f0, %f0, 11, 9
+# CHECK: wfc %f0, %f0, 11, 9
0xe7 0x00 0x00 0x09 0xb0 0xcb
-#CHECK: wfc %v19, %f14, 11, 9
+# CHECK: wfc %v19, %f14, 11, 9
0xe7 0x3e 0x00 0x09 0xb8 0xcb
-#CHECK: wfc %v31, %v31, 11, 9
+# CHECK: wfc %v31, %v31, 11, 9
0xe7 0xff 0x00 0x09 0xbc 0xcb
-#CHECK: wfcdb %f0, %f0
+# CHECK: wfcdb %f0, %f0
0xe7 0x00 0x00 0x00 0x30 0xcb
-#CHECK: wfcdb %v19, %f14
+# CHECK: wfcdb %v19, %f14
0xe7 0x3e 0x00 0x00 0x38 0xcb
-#CHECK: wfcdb %v31, %v31
+# CHECK: wfcdb %v31, %v31
0xe7 0xff 0x00 0x00 0x3c 0xcb
-#CHECK: wfcedb %f0, %f0, %f0
+# CHECK: wfcedb %f0, %f0, %f0
0xe7 0x00 0x00 0x08 0x30 0xe8
-#CHECK: wfcedb %v18, %f3, %v20
+# CHECK: wfcedb %v18, %f3, %v20
0xe7 0x23 0x40 0x08 0x3a 0xe8
-#CHECK: wfcedb %v31, %v31, %v31
+# CHECK: wfcedb %v31, %v31, %v31
0xe7 0xff 0xf0 0x08 0x3e 0xe8
-#CHECK: wfcedbs %f0, %f0, %f0
+# CHECK: wfcedbs %f0, %f0, %f0
0xe7 0x00 0x00 0x18 0x30 0xe8
-#CHECK: wfcedbs %v18, %f3, %v20
+# CHECK: wfcedbs %v18, %f3, %v20
0xe7 0x23 0x40 0x18 0x3a 0xe8
-#CHECK: wfcedbs %v31, %v31, %v31
+# CHECK: wfcedbs %v31, %v31, %v31
0xe7 0xff 0xf0 0x18 0x3e 0xe8
-#CHECK: wfchdb %f0, %f0, %f0
+# CHECK: wfchdb %f0, %f0, %f0
0xe7 0x00 0x00 0x08 0x30 0xeb
-#CHECK: wfchdb %v18, %f3, %v20
+# CHECK: wfchdb %v18, %f3, %v20
0xe7 0x23 0x40 0x08 0x3a 0xeb
-#CHECK: wfchdb %v31, %v31, %v31
+# CHECK: wfchdb %v31, %v31, %v31
0xe7 0xff 0xf0 0x08 0x3e 0xeb
-#CHECK: wfchdbs %f0, %f0, %f0
+# CHECK: wfchdbs %f0, %f0, %f0
0xe7 0x00 0x00 0x18 0x30 0xeb
-#CHECK: wfchdbs %v18, %f3, %v20
+# CHECK: wfchdbs %v18, %f3, %v20
0xe7 0x23 0x40 0x18 0x3a 0xeb
-#CHECK: wfchdbs %v31, %v31, %v31
+# CHECK: wfchdbs %v31, %v31, %v31
0xe7 0xff 0xf0 0x18 0x3e 0xeb
-#CHECK: wfchedb %f0, %f0, %f0
+# CHECK: wfchedb %f0, %f0, %f0
0xe7 0x00 0x00 0x08 0x30 0xea
-#CHECK: wfchedb %v18, %f3, %v20
+# CHECK: wfchedb %v18, %f3, %v20
0xe7 0x23 0x40 0x08 0x3a 0xea
-#CHECK: wfchedb %v31, %v31, %v31
+# CHECK: wfchedb %v31, %v31, %v31
0xe7 0xff 0xf0 0x08 0x3e 0xea
-#CHECK: wfchedbs %f0, %f0, %f0
+# CHECK: wfchedbs %f0, %f0, %f0
0xe7 0x00 0x00 0x18 0x30 0xea
-#CHECK: wfchedbs %v18, %f3, %v20
+# CHECK: wfchedbs %v18, %f3, %v20
0xe7 0x23 0x40 0x18 0x3a 0xea
-#CHECK: wfchedbs %v31, %v31, %v31
+# CHECK: wfchedbs %v31, %v31, %v31
0xe7 0xff 0xf0 0x18 0x3e 0xea
-#CHECK: wfddb %f0, %f0, %f0
+# CHECK: wfddb %f0, %f0, %f0
0xe7 0x00 0x00 0x08 0x30 0xe5
-#CHECK: wfddb %v18, %f3, %v20
+# CHECK: wfddb %v18, %f3, %v20
0xe7 0x23 0x40 0x08 0x3a 0xe5
-#CHECK: wfddb %v31, %v31, %v31
+# CHECK: wfddb %v31, %v31, %v31
0xe7 0xff 0xf0 0x08 0x3e 0xe5
-#CHECK: wfidb %f0, %f0, 0, 0
+# CHECK: wfidb %f0, %f0, 0, 0
0xe7 0x00 0x00 0x08 0x30 0xc7
-#CHECK: wfidb %v19, %f14, 4, 10
+# CHECK: wfidb %v19, %f14, 4, 10
0xe7 0x3e 0x00 0xac 0x38 0xc7
-#CHECK: wfidb %v31, %v31, 7, 15
+# CHECK: wfidb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xff 0x3c 0xc7
-#CHECK: wfk %f0, %f0, 11, 9
+# CHECK: wfk %f0, %f0, 11, 9
0xe7 0x00 0x00 0x09 0xb0 0xca
-#CHECK: wfk %v19, %f14, 11, 9
+# CHECK: wfk %v19, %f14, 11, 9
0xe7 0x3e 0x00 0x09 0xb8 0xca
-#CHECK: wfk %v31, %v31, 11, 9
+# CHECK: wfk %v31, %v31, 11, 9
0xe7 0xff 0x00 0x09 0xbc 0xca
-#CHECK: wfkdb %f0, %f0
+# CHECK: wfkdb %f0, %f0
0xe7 0x00 0x00 0x00 0x30 0xca
-#CHECK: wfkdb %v19, %f14
+# CHECK: wfkdb %v19, %f14
0xe7 0x3e 0x00 0x00 0x38 0xca
-#CHECK: wfkdb %v31, %v31
+# CHECK: wfkdb %v31, %v31
0xe7 0xff 0x00 0x00 0x3c 0xca
-#CHECK: wfpsodb %f0, %f0, 7
-0xe7 0x00 0x00 0x78 0x30 0xcc
-
-#CHECK: wfpsodb %v19, %f14, 7
-0xe7 0x3e 0x00 0x78 0x38 0xcc
-
-#CHECK: wfpsodb %v31, %v31, 7
-0xe7 0xff 0x00 0x78 0x3c 0xcc
-
-#CHECK: wflcdb %f0, %f0
+# CHECK: wflcdb %f0, %f0
0xe7 0x00 0x00 0x08 0x30 0xcc
-#CHECK: wflcdb %v19, %f14
+# CHECK: wflcdb %v19, %f14
0xe7 0x3e 0x00 0x08 0x38 0xcc
-#CHECK: wflcdb %v31, %v31
+# CHECK: wflcdb %v31, %v31
0xe7 0xff 0x00 0x08 0x3c 0xcc
-#CHECK: wflndb %f0, %f0
+# CHECK: wflndb %f0, %f0
0xe7 0x00 0x00 0x18 0x30 0xcc
-#CHECK: wflndb %v19, %f14
+# CHECK: wflndb %v19, %f14
0xe7 0x3e 0x00 0x18 0x38 0xcc
-#CHECK: wflndb %v31, %v31
+# CHECK: wflndb %v31, %v31
0xe7 0xff 0x00 0x18 0x3c 0xcc
-#CHECK: wflpdb %f0, %f0
+# CHECK: wflpdb %f0, %f0
0xe7 0x00 0x00 0x28 0x30 0xcc
-#CHECK: wflpdb %v19, %f14
+# CHECK: wflpdb %v19, %f14
0xe7 0x3e 0x00 0x28 0x38 0xcc
-#CHECK: wflpdb %v31, %v31
+# CHECK: wflpdb %v31, %v31
0xe7 0xff 0x00 0x28 0x3c 0xcc
-#CHECK: wfmadb %f0, %f0, %f0, %f0
+# CHECK: wfmadb %f0, %f0, %f0, %f0
0xe7 0x00 0x03 0x08 0x00 0x8f
-#CHECK: wfmadb %f3, %v20, %f5, %v22
+# CHECK: wfmadb %f3, %v20, %f5, %v22
0xe7 0x34 0x53 0x08 0x65 0x8f
-#CHECK: wfmadb %v31, %v31, %v31, %v31
+# CHECK: wfmadb %v31, %v31, %v31, %v31
0xe7 0xff 0xf3 0x08 0xff 0x8f
-#CHECK: wfmdb %f0, %f0, %f0
+# CHECK: wfmdb %f0, %f0, %f0
0xe7 0x00 0x00 0x08 0x30 0xe7
-#CHECK: wfmdb %v18, %f3, %v20
+# CHECK: wfmdb %v18, %f3, %v20
0xe7 0x23 0x40 0x08 0x3a 0xe7
-#CHECK: wfmdb %v31, %v31, %v31
+# CHECK: wfmdb %v31, %v31, %v31
0xe7 0xff 0xf0 0x08 0x3e 0xe7
-#CHECK: wfmsdb %f0, %f0, %f0, %f0
+# CHECK: wfmsdb %f0, %f0, %f0, %f0
0xe7 0x00 0x03 0x08 0x00 0x8e
-#CHECK: wfmsdb %f3, %v20, %f5, %v22
+# CHECK: wfmsdb %f3, %v20, %f5, %v22
0xe7 0x34 0x53 0x08 0x65 0x8e
-#CHECK: wfmsdb %v31, %v31, %v31, %v31
+# CHECK: wfmsdb %v31, %v31, %v31, %v31
0xe7 0xff 0xf3 0x08 0xff 0x8e
-#CHECK: wfsdb %f0, %f0, %f0
+# CHECK: wfpsodb %f0, %f0, 7
+0xe7 0x00 0x00 0x78 0x30 0xcc
+
+# CHECK: wfpsodb %v19, %f14, 7
+0xe7 0x3e 0x00 0x78 0x38 0xcc
+
+# CHECK: wfpsodb %v31, %v31, 7
+0xe7 0xff 0x00 0x78 0x3c 0xcc
+
+# CHECK: wfsdb %f0, %f0, %f0
0xe7 0x00 0x00 0x08 0x30 0xe2
-#CHECK: wfsdb %v18, %f3, %v20
+# CHECK: wfsdb %v18, %f3, %v20
0xe7 0x23 0x40 0x08 0x3a 0xe2
-#CHECK: wfsdb %v31, %v31, %v31
+# CHECK: wfsdb %v31, %v31, %v31
0xe7 0xff 0xf0 0x08 0x3e 0xe2
-#CHECK: wfsqdb %f0, %f0
+# CHECK: wfsqdb %f0, %f0
0xe7 0x00 0x00 0x08 0x30 0xce
-#CHECK: wfsqdb %v19, %f14
+# CHECK: wfsqdb %v19, %f14
0xe7 0x3e 0x00 0x08 0x38 0xce
-#CHECK: wfsqdb %v31, %v31
+# CHECK: wfsqdb %v31, %v31
0xe7 0xff 0x00 0x08 0x3c 0xce
-#CHECK: wftcidb %f0, %f0, 0
+# CHECK: wftcidb %f0, %f0, 0
0xe7 0x00 0x00 0x08 0x30 0x4a
-#CHECK: wftcidb %v19, %f4, 1383
+# CHECK: wftcidb %v19, %f4, 1383
0xe7 0x34 0x56 0x78 0x38 0x4a
-#CHECK: wftcidb %v31, %v31, 4095
+# CHECK: wftcidb %v31, %v31, 4095
0xe7 0xff 0xff 0xf8 0x3c 0x4a
-#CHECK: wldeb %f0, %f0
+# CHECK: wldeb %f0, %f0
0xe7 0x00 0x00 0x08 0x20 0xc4
-#CHECK: wldeb %v19, %f14
+# CHECK: wldeb %v19, %f14
0xe7 0x3e 0x00 0x08 0x28 0xc4
-#CHECK: wldeb %v31, %v31
+# CHECK: wldeb %v31, %v31
0xe7 0xff 0x00 0x08 0x2c 0xc4
-#CHECK: wledb %f0, %f0, 0, 0
+# CHECK: wledb %f0, %f0, 0, 0
0xe7 0x00 0x00 0x08 0x30 0xc5
-#CHECK: wledb %v19, %f14, 4, 10
+# CHECK: wledb %v19, %f14, 4, 10
0xe7 0x3e 0x00 0xac 0x38 0xc5
-#CHECK: wledb %v31, %v31, 7, 15
+# CHECK: wledb %v31, %v31, 7, 15
0xe7 0xff 0x00 0xff 0x3c 0xc5
-#CHECK: lochi %r11, 42, 0
-0xec 0xb0 0x00 0x2a 0x00 0x42
-
-#CHECK: lochio %r11, 42
-0xec 0xb1 0x00 0x2a 0x00 0x42
-
-#CHECK: lochih %r11, 42
-0xec 0xb2 0x00 0x2a 0x00 0x42
-
-#CHECK: lochinle %r11, 42
-0xec 0xb3 0x00 0x2a 0x00 0x42
-
-#CHECK: lochil %r11, -1
-0xec 0xb4 0xff 0xff 0x00 0x42
-
-#CHECK: lochinhe %r11, 42
-0xec 0xb5 0x00 0x2a 0x00 0x42
-
-#CHECK: lochilh %r11, -1
-0xec 0xb6 0xff 0xff 0x00 0x42
-
-#CHECK: lochine %r11, 0
-0xec 0xb7 0x00 0x00 0x00 0x42
-
-#CHECK: lochie %r11, 0
-0xec 0xb8 0x00 0x00 0x00 0x42
-
-#CHECK: lochinlh %r11, 42
-0xec 0xb9 0x00 0x2a 0x00 0x42
-
-#CHECK: lochihe %r11, 255
-0xec 0xba 0x00 0xff 0x00 0x42
-
-#CHECK: lochinl %r11, 255
-0xec 0xbb 0x00 0xff 0x00 0x42
-
-#CHECK: lochile %r11, 32767
-0xec 0xbc 0x7f 0xff 0x00 0x42
-
-#CHECK: lochinh %r11, 32767
-0xec 0xbd 0x7f 0xff 0x00 0x42
-
-#CHECK: lochino %r11, 32512
-0xec 0xbe 0x7f 0x00 0x00 0x42
-
-#CHECK: lochi %r11, 32512, 15
-0xec 0xbf 0x7f 0x00 0x00 0x42
-
-#CHECK: locghi %r11, 42, 0
-0xec 0xb0 0x00 0x2a 0x00 0x46
-
-#CHECK: locghio %r11, 42
-0xec 0xb1 0x00 0x2a 0x00 0x46
-
-#CHECK: locghih %r11, 42
-0xec 0xb2 0x00 0x2a 0x00 0x46
-
-#CHECK: locghinle %r11, 42
-0xec 0xb3 0x00 0x2a 0x00 0x46
-
-#CHECK: locghil %r11, -1
-0xec 0xb4 0xff 0xff 0x00 0x46
-
-#CHECK: locghinhe %r11, 42
-0xec 0xb5 0x00 0x2a 0x00 0x46
-
-#CHECK: locghilh %r11, -1
-0xec 0xb6 0xff 0xff 0x00 0x46
-
-#CHECK: locghine %r11, 0
-0xec 0xb7 0x00 0x00 0x00 0x46
-
-#CHECK: locghie %r11, 0
-0xec 0xb8 0x00 0x00 0x00 0x46
-
-#CHECK: locghinlh %r11, 42
-0xec 0xb9 0x00 0x2a 0x00 0x46
-
-#CHECK: locghihe %r11, 255
-0xec 0xba 0x00 0xff 0x00 0x46
-
-#CHECK: locghinl %r11, 255
-0xec 0xbb 0x00 0xff 0x00 0x46
-
-#CHECK: locghile %r11, 32767
-0xec 0xbc 0x7f 0xff 0x00 0x46
-
-#CHECK: locghinh %r11, 32767
-0xec 0xbd 0x7f 0xff 0x00 0x46
-
-#CHECK: locghino %r11, 32512
-0xec 0xbe 0x7f 0x00 0x00 0x46
-
-#CHECK: locghi %r11, 32512, 15
-0xec 0xbf 0x7f 0x00 0x00 0x46
-
-#CHECK: lochhi %r11, 42, 0
-0xec 0xb0 0x00 0x2a 0x00 0x4e
-
-#CHECK: lochhio %r11, 42
-0xec 0xb1 0x00 0x2a 0x00 0x4e
-
-#CHECK: lochhih %r11, 42
-0xec 0xb2 0x00 0x2a 0x00 0x4e
-
-#CHECK: lochhinle %r11, 42
-0xec 0xb3 0x00 0x2a 0x00 0x4e
-
-#CHECK: lochhil %r11, -1
-0xec 0xb4 0xff 0xff 0x00 0x4e
-
-#CHECK: lochhinhe %r11, 42
-0xec 0xb5 0x00 0x2a 0x00 0x4e
-
-#CHECK: lochhilh %r11, -1
-0xec 0xb6 0xff 0xff 0x00 0x4e
-
-#CHECK: lochhine %r11, 0
-0xec 0xb7 0x00 0x00 0x00 0x4e
-
-#CHECK: lochhie %r11, 0
-0xec 0xb8 0x00 0x00 0x00 0x4e
-
-#CHECK: lochhinlh %r11, 42
-0xec 0xb9 0x00 0x2a 0x00 0x4e
-
-#CHECK: lochhihe %r11, 255
-0xec 0xba 0x00 0xff 0x00 0x4e
-
-#CHECK: lochhinl %r11, 255
-0xec 0xbb 0x00 0xff 0x00 0x4e
-
-#CHECK: lochhile %r11, 32767
-0xec 0xbc 0x7f 0xff 0x00 0x4e
-
-#CHECK: lochhinh %r11, 32767
-0xec 0xbd 0x7f 0xff 0x00 0x4e
-
-#CHECK: lochhino %r11, 32512
-0xec 0xbe 0x7f 0x00 0x00 0x4e
-
-#CHECK: lochhi %r11, 32512, 15
-0xec 0xbf 0x7f 0x00 0x00 0x4e
-
-# CHECK: locfh %r7, 6399(%r8), 0
-0xeb 0x70 0x88 0xff 0x01 0xe0
-
-# CHECK: locfho %r7, 6399(%r8)
-0xeb 0x71 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhh %r7, 6399(%r8)
-0xeb 0x72 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhnle %r7, 6399(%r8)
-0xeb 0x73 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhl %r7, 6399(%r8)
-0xeb 0x74 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhnhe %r7, 6399(%r8)
-0xeb 0x75 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhlh %r7, 6399(%r8)
-0xeb 0x76 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhne %r7, 6399(%r8)
-0xeb 0x77 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhe %r7, 6399(%r8)
-0xeb 0x78 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhnlh %r7, 6399(%r8)
-0xeb 0x79 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhhe %r7, 6399(%r8)
-0xeb 0x7a 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhnl %r7, 6399(%r8)
-0xeb 0x7b 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhle %r7, 6399(%r8)
-0xeb 0x7c 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhnh %r7, 6399(%r8)
-0xeb 0x7d 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhno %r7, 6399(%r8)
-0xeb 0x7e 0x88 0xff 0x01 0xe0
-
-# CHECK: locfh %r7, 6399(%r8), 15
-0xeb 0x7f 0x88 0xff 0x01 0xe0
-
-# CHECK: locfhr %r11, %r3, 0
-0xb9 0xe0 0x00 0xb3
-
-# CHECK: locfhro %r11, %r3
-0xb9 0xe0 0x10 0xb3
-
-# CHECK: locfhrh %r11, %r3
-0xb9 0xe0 0x20 0xb3
-
-# CHECK: locfhrnle %r11, %r3
-0xb9 0xe0 0x30 0xb3
-
-# CHECK: locfhrl %r11, %r3
-0xb9 0xe0 0x40 0xb3
-
-# CHECK: locfhrnhe %r11, %r3
-0xb9 0xe0 0x50 0xb3
-
-# CHECK: locfhrlh %r11, %r3
-0xb9 0xe0 0x60 0xb3
-
-# CHECK: locfhrne %r11, %r3
-0xb9 0xe0 0x70 0xb3
-
-# CHECK: locfhre %r11, %r3
-0xb9 0xe0 0x80 0xb3
-
-# CHECK: locfhrnlh %r11, %r3
-0xb9 0xe0 0x90 0xb3
-
-# CHECK: locfhrhe %r11, %r3
-0xb9 0xe0 0xa0 0xb3
-
-# CHECK: locfhrnl %r11, %r3
-0xb9 0xe0 0xb0 0xb3
-
-# CHECK: locfhrle %r11, %r3
-0xb9 0xe0 0xc0 0xb3
-
-# CHECK: locfhrnh %r11, %r3
-0xb9 0xe0 0xd0 0xb3
-
-# CHECK: locfhrno %r11, %r3
-0xb9 0xe0 0xe0 0xb3
-
-# CHECK: locfhr %r11, %r3, 15
-0xb9 0xe0 0xf0 0xb3
-
-# CHECK: stocfh %r1, 2(%r3), 0
-0xeb 0x10 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfho %r1, 2(%r3)
-0xeb 0x11 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhh %r1, 2(%r3)
-0xeb 0x12 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhnle %r1, 2(%r3)
-0xeb 0x13 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhl %r1, 2(%r3)
-0xeb 0x14 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhnhe %r1, 2(%r3)
-0xeb 0x15 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhlh %r1, 2(%r3)
-0xeb 0x16 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhne %r1, 2(%r3)
-0xeb 0x17 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhe %r1, 2(%r3)
-0xeb 0x18 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhnlh %r1, 2(%r3)
-0xeb 0x19 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhhe %r1, 2(%r3)
-0xeb 0x1a 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhnl %r1, 2(%r3)
-0xeb 0x1b 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhle %r1, 2(%r3)
-0xeb 0x1c 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhnh %r1, 2(%r3)
-0xeb 0x1d 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfhno %r1, 2(%r3)
-0xeb 0x1e 0x30 0x02 0x00 0xe1
-
-# CHECK: stocfh %r1, 2(%r3), 15
-0xeb 0x1f 0x30 0x02 0x00 0xe1
-
diff --git a/test/MC/Disassembler/SystemZ/insns.txt b/test/MC/Disassembler/SystemZ/insns.txt
index 9f76b6a5fd44..dac94099f276 100644
--- a/test/MC/Disassembler/SystemZ/insns.txt
+++ b/test/MC/Disassembler/SystemZ/insns.txt
@@ -1,17 +1,26 @@
# Test instructions that don't have PC-relative operands.
# RUN: llvm-mc --disassemble %s -triple=s390x-linux-gnu -mcpu=zEC12 | FileCheck %s
-# CHECK: adbr %f0, %f0
-0xb3 0x1a 0x00 0x00
+# CHECK: a %r0, 0
+0x5a 0x00 0x00 0x00
-# CHECK: adbr %f0, %f15
-0xb3 0x1a 0x00 0x0f
+# CHECK: a %r0, 4095
+0x5a 0x00 0x0f 0xff
-# CHECK: adbr %f7, %f8
-0xb3 0x1a 0x00 0x78
+# CHECK: a %r0, 0(%r1)
+0x5a 0x00 0x10 0x00
-# CHECK: adbr %f15, %f0
-0xb3 0x1a 0x00 0xf0
+# CHECK: a %r0, 0(%r15)
+0x5a 0x00 0xf0 0x00
+
+# CHECK: a %r0, 4095(%r1,%r15)
+0x5a 0x01 0xff 0xff
+
+# CHECK: a %r0, 4095(%r15,%r1)
+0x5a 0x0f 0x1f 0xff
+
+# CHECK: a %r15, 0
+0x5a 0xf0 0x00 0x00
# CHECK: adb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x1a
@@ -34,17 +43,17 @@
# CHECK: adb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x1a
-# CHECK: aebr %f0, %f0
-0xb3 0x0a 0x00 0x00
+# CHECK: adbr %f0, %f0
+0xb3 0x1a 0x00 0x00
-# CHECK: aebr %f0, %f15
-0xb3 0x0a 0x00 0x0f
+# CHECK: adbr %f0, %f15
+0xb3 0x1a 0x00 0x0f
-# CHECK: aebr %f7, %f8
-0xb3 0x0a 0x00 0x78
+# CHECK: adbr %f7, %f8
+0xb3 0x1a 0x00 0x78
-# CHECK: aebr %f15, %f0
-0xb3 0x0a 0x00 0xf0
+# CHECK: adbr %f15, %f0
+0xb3 0x1a 0x00 0xf0
# CHECK: aeb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x0a
@@ -67,6 +76,18 @@
# CHECK: aeb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x0a
+# CHECK: aebr %f0, %f0
+0xb3 0x0a 0x00 0x00
+
+# CHECK: aebr %f0, %f15
+0xb3 0x0a 0x00 0x0f
+
+# CHECK: aebr %f7, %f8
+0xb3 0x0a 0x00 0x78
+
+# CHECK: aebr %f15, %f0
+0xb3 0x0a 0x00 0xf0
+
# CHECK: afi %r0, -2147483648
0xc2 0x09 0x80 0x00 0x00 0x00
@@ -85,35 +106,35 @@
# CHECK: afi %r15, 0
0xc2 0xf9 0x00 0x00 0x00 0x00
-# CHECK: agfi %r0, -2147483648
-0xc2 0x08 0x80 0x00 0x00 0x00
+# CHECK: ag %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x08
-# CHECK: agfi %r0, -1
-0xc2 0x08 0xff 0xff 0xff 0xff
+# CHECK: ag %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x08
-# CHECK: agfi %r0, 0
-0xc2 0x08 0x00 0x00 0x00 0x00
+# CHECK: ag %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x08
-# CHECK: agfi %r0, 1
-0xc2 0x08 0x00 0x00 0x00 0x01
+# CHECK: ag %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x08
-# CHECK: agfi %r0, 2147483647
-0xc2 0x08 0x7f 0xff 0xff 0xff
+# CHECK: ag %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x08
-# CHECK: agfi %r15, 0
-0xc2 0xf8 0x00 0x00 0x00 0x00
+# CHECK: ag %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x08
-# CHECK: agfr %r0, %r0
-0xb9 0x18 0x00 0x00
+# CHECK: ag %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x08
-# CHECK: agfr %r0, %r15
-0xb9 0x18 0x00 0x0f
+# CHECK: ag %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x08
-# CHECK: agfr %r15, %r0
-0xb9 0x18 0x00 0xf0
+# CHECK: ag %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x08
-# CHECK: agfr %r7, %r8
-0xb9 0x18 0x00 0x78
+# CHECK: ag %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x08
# CHECK: agf %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x18
@@ -145,6 +166,36 @@
# CHECK: agf %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x18
+# CHECK: agfi %r0, -2147483648
+0xc2 0x08 0x80 0x00 0x00 0x00
+
+# CHECK: agfi %r0, -1
+0xc2 0x08 0xff 0xff 0xff 0xff
+
+# CHECK: agfi %r0, 0
+0xc2 0x08 0x00 0x00 0x00 0x00
+
+# CHECK: agfi %r0, 1
+0xc2 0x08 0x00 0x00 0x00 0x01
+
+# CHECK: agfi %r0, 2147483647
+0xc2 0x08 0x7f 0xff 0xff 0xff
+
+# CHECK: agfi %r15, 0
+0xc2 0xf8 0x00 0x00 0x00 0x00
+
+# CHECK: agfr %r0, %r0
+0xb9 0x18 0x00 0x00
+
+# CHECK: agfr %r0, %r15
+0xb9 0x18 0x00 0x0f
+
+# CHECK: agfr %r15, %r0
+0xb9 0x18 0x00 0xf0
+
+# CHECK: agfr %r7, %r8
+0xb9 0x18 0x00 0x78
+
# CHECK: aghi %r0, -32768
0xa7 0x0b 0x80 0x00
@@ -235,35 +286,26 @@
# CHECK: agsi 524287(%r15), 42
0xeb 0x2a 0xff 0xff 0x7f 0x7a
-# CHECK: ag %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x08
-
-# CHECK: ag %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x08
-
-# CHECK: ag %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x08
-
-# CHECK: ag %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x08
+# CHECK: ah %r0, 0
+0x4a 0x00 0x00 0x00
-# CHECK: ag %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x08
+# CHECK: ah %r0, 4095
+0x4a 0x00 0x0f 0xff
-# CHECK: ag %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x08
+# CHECK: ah %r0, 0(%r1)
+0x4a 0x00 0x10 0x00
-# CHECK: ag %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x08
+# CHECK: ah %r0, 0(%r15)
+0x4a 0x00 0xf0 0x00
-# CHECK: ag %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x08
+# CHECK: ah %r0, 4095(%r1,%r15)
+0x4a 0x01 0xff 0xff
-# CHECK: ag %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x08
+# CHECK: ah %r0, 4095(%r15,%r1)
+0x4a 0x0f 0x1f 0xff
-# CHECK: ag %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x08
+# CHECK: ah %r15, 0
+0x4a 0xf0 0x00 0x00
# CHECK: ahi %r0, -32768
0xa7 0x0a 0x80 0x00
@@ -298,27 +340,6 @@
# CHECK: ahik %r8, %r15, 32767
0xec 0x8f 0x7f 0xff 0x00 0xd8
-# CHECK: ah %r0, 0
-0x4a 0x00 0x00 0x00
-
-# CHECK: ah %r0, 4095
-0x4a 0x00 0x0f 0xff
-
-# CHECK: ah %r0, 0(%r1)
-0x4a 0x00 0x10 0x00
-
-# CHECK: ah %r0, 0(%r15)
-0x4a 0x00 0xf0 0x00
-
-# CHECK: ah %r0, 4095(%r1,%r15)
-0x4a 0x01 0xff 0xff
-
-# CHECK: ah %r0, 4095(%r15,%r1)
-0x4a 0x0f 0x1f 0xff
-
-# CHECK: ah %r15, 0
-0x4a 0xf0 0x00 0x00
-
# CHECK: ahy %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x7a
@@ -367,17 +388,56 @@
# CHECK: aih %r15, 0
0xcc 0xf8 0x00 0x00 0x00 0x00
-# CHECK: alcgr %r0, %r0
-0xb9 0x88 0x00 0x00
+# CHECK: al %r0, 0
+0x5e 0x00 0x00 0x00
-# CHECK: alcgr %r0, %r15
-0xb9 0x88 0x00 0x0f
+# CHECK: al %r0, 4095
+0x5e 0x00 0x0f 0xff
-# CHECK: alcgr %r15, %r0
-0xb9 0x88 0x00 0xf0
+# CHECK: al %r0, 0(%r1)
+0x5e 0x00 0x10 0x00
-# CHECK: alcgr %r7, %r8
-0xb9 0x88 0x00 0x78
+# CHECK: al %r0, 0(%r15)
+0x5e 0x00 0xf0 0x00
+
+# CHECK: al %r0, 4095(%r1,%r15)
+0x5e 0x01 0xff 0xff
+
+# CHECK: al %r0, 4095(%r15,%r1)
+0x5e 0x0f 0x1f 0xff
+
+# CHECK: al %r15, 0
+0x5e 0xf0 0x00 0x00
+
+# CHECK: alc %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x98
+
+# CHECK: alc %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x98
+
+# CHECK: alc %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x98
+
+# CHECK: alc %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x98
+
+# CHECK: alc %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x98
+
+# CHECK: alc %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x98
+
+# CHECK: alc %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x98
+
+# CHECK: alc %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x98
+
+# CHECK: alc %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x98
+
+# CHECK: alc %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x98
# CHECK: alcg %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x88
@@ -409,6 +469,18 @@
# CHECK: alcg %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x88
+# CHECK: alcgr %r0, %r0
+0xb9 0x88 0x00 0x00
+
+# CHECK: alcgr %r0, %r15
+0xb9 0x88 0x00 0x0f
+
+# CHECK: alcgr %r15, %r0
+0xb9 0x88 0x00 0xf0
+
+# CHECK: alcgr %r7, %r8
+0xb9 0x88 0x00 0x78
+
# CHECK: alcr %r0, %r0
0xb9 0x98 0x00 0x00
@@ -421,36 +493,6 @@
# CHECK: alcr %r7, %r8
0xb9 0x98 0x00 0x78
-# CHECK: alc %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x98
-
-# CHECK: alc %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x98
-
-# CHECK: alc %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x98
-
-# CHECK: alc %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x98
-
-# CHECK: alc %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x98
-
-# CHECK: alc %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x98
-
-# CHECK: alc %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x98
-
-# CHECK: alc %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x98
-
-# CHECK: alc %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x98
-
-# CHECK: alc %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x98
-
# CHECK: alfi %r0, 0
0xc2 0x0b 0x00 0x00 0x00 0x00
@@ -460,26 +502,35 @@
# CHECK: alfi %r15, 0
0xc2 0xfb 0x00 0x00 0x00 0x00
-# CHECK: algfi %r0, 0
-0xc2 0x0a 0x00 0x00 0x00 0x00
+# CHECK: alg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x0a
-# CHECK: algfi %r0, 4294967295
-0xc2 0x0a 0xff 0xff 0xff 0xff
+# CHECK: alg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x0a
-# CHECK: algfi %r15, 0
-0xc2 0xfa 0x00 0x00 0x00 0x00
+# CHECK: alg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x0a
-# CHECK: algfr %r0, %r0
-0xb9 0x1a 0x00 0x00
+# CHECK: alg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x0a
-# CHECK: algfr %r0, %r15
-0xb9 0x1a 0x00 0x0f
+# CHECK: alg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x0a
-# CHECK: algfr %r15, %r0
-0xb9 0x1a 0x00 0xf0
+# CHECK: alg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x0a
-# CHECK: algfr %r7, %r8
-0xb9 0x1a 0x00 0x78
+# CHECK: alg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x0a
+
+# CHECK: alg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x0a
+
+# CHECK: alg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x0a
+
+# CHECK: alg %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x0a
# CHECK: algf %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x1a
@@ -511,6 +562,42 @@
# CHECK: algf %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x1a
+# CHECK: algfi %r0, 0
+0xc2 0x0a 0x00 0x00 0x00 0x00
+
+# CHECK: algfi %r0, 4294967295
+0xc2 0x0a 0xff 0xff 0xff 0xff
+
+# CHECK: algfi %r15, 0
+0xc2 0xfa 0x00 0x00 0x00 0x00
+
+# CHECK: algfr %r0, %r0
+0xb9 0x1a 0x00 0x00
+
+# CHECK: algfr %r0, %r15
+0xb9 0x1a 0x00 0x0f
+
+# CHECK: algfr %r15, %r0
+0xb9 0x1a 0x00 0xf0
+
+# CHECK: algfr %r7, %r8
+0xb9 0x1a 0x00 0x78
+
+# CHECK: alghsik %r0, %r1, -32768
+0xec 0x01 0x80 0x00 0x00 0xdb
+
+# CHECK: alghsik %r2, %r3, -1
+0xec 0x23 0xff 0xff 0x00 0xdb
+
+# CHECK: alghsik %r4, %r5, 0
+0xec 0x45 0x00 0x00 0x00 0xdb
+
+# CHECK: alghsik %r6, %r7, 1
+0xec 0x67 0x00 0x01 0x00 0xdb
+
+# CHECK: alghsik %r8, %r15, 32767
+0xec 0x8f 0x7f 0xff 0x00 0xdb
+
# CHECK: algr %r0, %r0
0xb9 0x0a 0x00 0x00
@@ -529,50 +616,44 @@
# CHECK: algrk %r2, %r3, %r4
0xb9 0xea 0x40 0x23
-# CHECK: alg %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x0a
-
-# CHECK: alg %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x0a
-
-# CHECK: alg %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x0a
+# CHECK: algsi -524288, 0
+0xeb 0x00 0x00 0x00 0x80 0x7e
-# CHECK: alg %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x0a
+# CHECK: algsi -1, 0
+0xeb 0x00 0x0f 0xff 0xff 0x7e
-# CHECK: alg %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x0a
+# CHECK: algsi 0, 0
+0xeb 0x00 0x00 0x00 0x00 0x7e
-# CHECK: alg %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x0a
+# CHECK: algsi 1, 0
+0xeb 0x00 0x00 0x01 0x00 0x7e
-# CHECK: alg %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x0a
+# CHECK: algsi 524287, 0
+0xeb 0x00 0x0f 0xff 0x7f 0x7e
-# CHECK: alg %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x0a
+# CHECK: algsi 0, -128
+0xeb 0x80 0x00 0x00 0x00 0x7e
-# CHECK: alg %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x0a
+# CHECK: algsi 0, -1
+0xeb 0xff 0x00 0x00 0x00 0x7e
-# CHECK: alg %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x0a
+# CHECK: algsi 0, 1
+0xeb 0x01 0x00 0x00 0x00 0x7e
-# CHECK: alghsik %r0, %r1, -32768
-0xec 0x01 0x80 0x00 0x00 0xdb
+# CHECK: algsi 0, 127
+0xeb 0x7f 0x00 0x00 0x00 0x7e
-# CHECK: alghsik %r2, %r3, -1
-0xec 0x23 0xff 0xff 0x00 0xdb
+# CHECK: algsi 0(%r1), 42
+0xeb 0x2a 0x10 0x00 0x00 0x7e
-# CHECK: alghsik %r4, %r5, 0
-0xec 0x45 0x00 0x00 0x00 0xdb
+# CHECK: algsi 0(%r15), 42
+0xeb 0x2a 0xf0 0x00 0x00 0x7e
-# CHECK: alghsik %r6, %r7, 1
-0xec 0x67 0x00 0x01 0x00 0xdb
+# CHECK: algsi 524287(%r1), 42
+0xeb 0x2a 0x1f 0xff 0x7f 0x7e
-# CHECK: alghsik %r8, %r15, 32767
-0xec 0x8f 0x7f 0xff 0x00 0xdb
+# CHECK: algsi 524287(%r15), 42
+0xeb 0x2a 0xff 0xff 0x7f 0x7e
# CHECK: alhsik %r0, %r1, -32768
0xec 0x01 0x80 0x00 0x00 0xda
@@ -607,26 +688,44 @@
# CHECK: alrk %r2, %r3, %r4
0xb9 0xfa 0x40 0x23
-# CHECK: al %r0, 0
-0x5e 0x00 0x00 0x00
+# CHECK: alsi -524288, 0
+0xeb 0x00 0x00 0x00 0x80 0x6e
-# CHECK: al %r0, 4095
-0x5e 0x00 0x0f 0xff
+# CHECK: alsi -1, 0
+0xeb 0x00 0x0f 0xff 0xff 0x6e
-# CHECK: al %r0, 0(%r1)
-0x5e 0x00 0x10 0x00
+# CHECK: alsi 0, 0
+0xeb 0x00 0x00 0x00 0x00 0x6e
-# CHECK: al %r0, 0(%r15)
-0x5e 0x00 0xf0 0x00
+# CHECK: alsi 1, 0
+0xeb 0x00 0x00 0x01 0x00 0x6e
-# CHECK: al %r0, 4095(%r1,%r15)
-0x5e 0x01 0xff 0xff
+# CHECK: alsi 524287, 0
+0xeb 0x00 0x0f 0xff 0x7f 0x6e
-# CHECK: al %r0, 4095(%r15,%r1)
-0x5e 0x0f 0x1f 0xff
+# CHECK: alsi 0, -128
+0xeb 0x80 0x00 0x00 0x00 0x6e
-# CHECK: al %r15, 0
-0x5e 0xf0 0x00 0x00
+# CHECK: alsi 0, -1
+0xeb 0xff 0x00 0x00 0x00 0x6e
+
+# CHECK: alsi 0, 1
+0xeb 0x01 0x00 0x00 0x00 0x6e
+
+# CHECK: alsi 0, 127
+0xeb 0x7f 0x00 0x00 0x00 0x6e
+
+# CHECK: alsi 0(%r1), 42
+0xeb 0x2a 0x10 0x00 0x00 0x6e
+
+# CHECK: alsi 0(%r15), 42
+0xeb 0x2a 0xf0 0x00 0x00 0x6e
+
+# CHECK: alsi 524287(%r1), 42
+0xeb 0x2a 0x1f 0xff 0x7f 0x6e
+
+# CHECK: alsi 524287(%r15), 42
+0xeb 0x2a 0xff 0xff 0x7f 0x6e
# CHECK: aly %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x5e
@@ -658,6 +757,48 @@
# CHECK: aly %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x5e
+# CHECK: ap 0(1), 0(1)
+0xfa 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: ap 0(1), 0(1,%r1)
+0xfa 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: ap 0(1), 0(1,%r15)
+0xfa 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: ap 0(1), 4095(1)
+0xfa 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: ap 0(1), 4095(1,%r1)
+0xfa 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: ap 0(1), 4095(1,%r15)
+0xfa 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: ap 0(1,%r1), 0(1)
+0xfa 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: ap 0(1,%r15), 0(1)
+0xfa 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: ap 4095(1,%r1), 0(1)
+0xfa 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: ap 4095(1,%r15), 0(1)
+0xfa 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: ap 0(16,%r1), 0(1)
+0xfa 0xf0 0x10 0x00 0x00 0x00
+
+# CHECK: ap 0(16,%r15), 0(1)
+0xfa 0xf0 0xf0 0x00 0x00 0x00
+
+# CHECK: ap 0(1), 0(16,%r1)
+0xfa 0x0f 0x00 0x00 0x10 0x00
+
+# CHECK: ap 0(1), 0(16,%r15)
+0xfa 0x0f 0x00 0x00 0xf0 0x00
+
# CHECK: ar %r0, %r0
0x1a 0x00
@@ -715,27 +856,6 @@
# CHECK: asi 524287(%r15), 42
0xeb 0x2a 0xff 0xff 0x7f 0x6a
-# CHECK: a %r0, 0
-0x5a 0x00 0x00 0x00
-
-# CHECK: a %r0, 4095
-0x5a 0x00 0x0f 0xff
-
-# CHECK: a %r0, 0(%r1)
-0x5a 0x00 0x10 0x00
-
-# CHECK: a %r0, 0(%r15)
-0x5a 0x00 0xf0 0x00
-
-# CHECK: a %r0, 4095(%r1,%r15)
-0x5a 0x01 0xff 0xff
-
-# CHECK: a %r0, 4095(%r15,%r1)
-0x5a 0x0f 0x1f 0xff
-
-# CHECK: a %r15, 0
-0x5a 0xf0 0x00 0x00
-
# CHECK: axbr %f0, %f0
0xb3 0x4a 0x00 0x00
@@ -778,6 +898,24 @@
# CHECK: ay %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x5a
+# CHECK: b 0
+0x47 0xf0 0x00 0x00
+
+# CHECK: b 4095
+0x47 0xf0 0x0f 0xff
+
+# CHECK: b 0(%r1)
+0x47 0xf0 0x10 0x00
+
+# CHECK: b 0(%r15)
+0x47 0xf0 0xf0 0x00
+
+# CHECK: b 4095(%r1,%r15)
+0x47 0xf1 0xff 0xff
+
+# CHECK: b 4095(%r15,%r1)
+0x47 0xff 0x1f 0xff
+
# CHECK: bal %r0, 0
0x45 0x00 0x00 0x00
@@ -850,36 +988,6 @@
# CHECK: bassm %r15, %r1
0x0c 0xf1
-# CHECK: bsm %r0, %r1
-0x0b 0x01
-
-# CHECK: bsm %r0, %r15
-0x0b 0x0f
-
-# CHECK: bsm %r14, %r9
-0x0b 0xe9
-
-# CHECK: bsm %r15, %r1
-0x0b 0xf1
-
-# CHECK: b 0
-0x47 0xf0 0x00 0x00
-
-# CHECK: b 4095
-0x47 0xf0 0x0f 0xff
-
-# CHECK: b 0(%r1)
-0x47 0xf0 0x10 0x00
-
-# CHECK: b 0(%r15)
-0x47 0xf0 0xf0 0x00
-
-# CHECK: b 4095(%r1,%r15)
-0x47 0xf1 0xff 0xff
-
-# CHECK: b 4095(%r15,%r1)
-0x47 0xff 0x1f 0xff
-
# CHECK: bc 0, 0
0x47 0x00 0x00 0x00
@@ -1015,18 +1123,6 @@
# CHECK: bct %r15, 0
0x46 0xf0 0x00 0x00
-# CHECK: bctr %r0, %r9
-0x06 0x09
-
-# CHECK: bctr %r0, %r15
-0x06 0x0f
-
-# CHECK: bctr %r15, %r0
-0x06 0xf0
-
-# CHECK: bctr %r15, %r9
-0x06 0xf9
-
# CHECK: bctg %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x46
@@ -1069,6 +1165,30 @@
# CHECK: bctgr %r15, %r9
0xb9 0x46 0x00 0xf9
+# CHECK: bctr %r0, %r9
+0x06 0x09
+
+# CHECK: bctr %r0, %r15
+0x06 0x0f
+
+# CHECK: bctr %r15, %r0
+0x06 0xf0
+
+# CHECK: bctr %r15, %r9
+0x06 0xf9
+
+# CHECK: bsm %r0, %r1
+0x0b 0x01
+
+# CHECK: bsm %r0, %r15
+0x0b 0x0f
+
+# CHECK: bsm %r14, %r9
+0x0b 0xe9
+
+# CHECK: bsm %r15, %r1
+0x0b 0xf1
+
# CHECK: bxh %r0, %r0, 0
0x86 0x00 0x00 0x00
@@ -1177,7 +1297,7 @@
# CHECK: bxleg %r14, %r15, 0
0xeb 0xef 0x00 0x00 0x00 0x45
-# CHECK: bxleg %r15, %r15, 0
+# CHECK: bxleg %r15, %r15, 0
0xeb 0xff 0x00 0x00 0x00 0x45
# CHECK: bxleg %r0, %r0, -524288
@@ -1207,17 +1327,26 @@
# CHECK: bxleg %r0, %r0, 524287(%r15)
0xeb 0x00 0xff 0xff 0x7f 0x45
-# CHECK: cdbr %f0, %f0
-0xb3 0x19 0x00 0x00
+# CHECK: c %r0, 0
+0x59 0x00 0x00 0x00
-# CHECK: cdbr %f0, %f15
-0xb3 0x19 0x00 0x0f
+# CHECK: c %r0, 4095
+0x59 0x00 0x0f 0xff
-# CHECK: cdbr %f7, %f8
-0xb3 0x19 0x00 0x78
+# CHECK: c %r0, 0(%r1)
+0x59 0x00 0x10 0x00
-# CHECK: cdbr %f15, %f0
-0xb3 0x19 0x00 0xf0
+# CHECK: c %r0, 0(%r15)
+0x59 0x00 0xf0 0x00
+
+# CHECK: c %r0, 4095(%r1,%r15)
+0x59 0x01 0xff 0xff
+
+# CHECK: c %r0, 4095(%r15,%r1)
+0x59 0x0f 0x1f 0xff
+
+# CHECK: c %r15, 0
+0x59 0xf0 0x00 0x00
# CHECK: cdb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x19
@@ -1240,6 +1369,18 @@
# CHECK: cdb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x19
+# CHECK: cdbr %f0, %f0
+0xb3 0x19 0x00 0x00
+
+# CHECK: cdbr %f0, %f15
+0xb3 0x19 0x00 0x0f
+
+# CHECK: cdbr %f7, %f8
+0xb3 0x19 0x00 0x78
+
+# CHECK: cdbr %f15, %f0
+0xb3 0x19 0x00 0xf0
+
# CHECK: cdfbr %f0, %r0
0xb3 0x95 0x00 0x00
@@ -1255,22 +1396,22 @@
# CHECK: cdfbr %f15, %r15
0xb3 0x95 0x00 0xff
-# CHECK: cdfbra %f0, 0, %r0, 1
+# CHECK: cdfbra %f0, 0, %r0, 1
0xb3 0x95 0x01 0x00
-# CHECK: cdfbra %f0, 0, %r0, 15
+# CHECK: cdfbra %f0, 0, %r0, 15
0xb3 0x95 0x0f 0x00
-# CHECK: cdfbra %f0, 0, %r15, 1
+# CHECK: cdfbra %f0, 0, %r15, 1
0xb3 0x95 0x01 0x0f
-# CHECK: cdfbra %f0, 15, %r0, 1
+# CHECK: cdfbra %f0, 15, %r0, 1
0xb3 0x95 0xf1 0x00
-# CHECK: cdfbra %f4, 5, %r6, 7
+# CHECK: cdfbra %f4, 5, %r6, 7
0xb3 0x95 0x57 0x46
-# CHECK: cdfbra %f15, 0, %r0, 1
+# CHECK: cdfbra %f15, 0, %r0, 1
0xb3 0x95 0x01 0xf0
# CHECK: cdgbr %f0, %r0
@@ -1288,58 +1429,58 @@
# CHECK: cdgbr %f15, %r15
0xb3 0xa5 0x00 0xff
-# CHECK: cdgbra %f0, 0, %r0, 1
+# CHECK: cdgbra %f0, 0, %r0, 1
0xb3 0xa5 0x01 0x00
-# CHECK: cdgbra %f0, 0, %r0, 15
+# CHECK: cdgbra %f0, 0, %r0, 15
0xb3 0xa5 0x0f 0x00
-# CHECK: cdgbra %f0, 0, %r15, 1
+# CHECK: cdgbra %f0, 0, %r15, 1
0xb3 0xa5 0x01 0x0f
-# CHECK: cdgbra %f0, 15, %r0, 1
+# CHECK: cdgbra %f0, 15, %r0, 1
0xb3 0xa5 0xf1 0x00
-# CHECK: cdgbra %f4, 5, %r6, 7
+# CHECK: cdgbra %f4, 5, %r6, 7
0xb3 0xa5 0x57 0x46
-# CHECK: cdgbra %f15, 0, %r0, 1
+# CHECK: cdgbra %f15, 0, %r0, 1
0xb3 0xa5 0x01 0xf0
-# CHECK: cdlfbr %f0, 0, %r0, 1
+# CHECK: cdlfbr %f0, 0, %r0, 1
0xb3 0x91 0x01 0x00
-# CHECK: cdlfbr %f0, 0, %r0, 15
+# CHECK: cdlfbr %f0, 0, %r0, 15
0xb3 0x91 0x0f 0x00
-# CHECK: cdlfbr %f0, 0, %r15, 1
+# CHECK: cdlfbr %f0, 0, %r15, 1
0xb3 0x91 0x01 0x0f
-# CHECK: cdlfbr %f0, 15, %r0, 1
+# CHECK: cdlfbr %f0, 15, %r0, 1
0xb3 0x91 0xf1 0x00
-# CHECK: cdlfbr %f4, 5, %r6, 7
+# CHECK: cdlfbr %f4, 5, %r6, 7
0xb3 0x91 0x57 0x46
-# CHECK: cdlfbr %f15, 0, %r0, 1
+# CHECK: cdlfbr %f15, 0, %r0, 1
0xb3 0x91 0x01 0xf0
-# CHECK: cdlgbr %f0, 0, %r0, 1
+# CHECK: cdlgbr %f0, 0, %r0, 1
0xb3 0xa1 0x01 0x00
-# CHECK: cdlgbr %f0, 0, %r0, 15
+# CHECK: cdlgbr %f0, 0, %r0, 15
0xb3 0xa1 0x0f 0x00
-# CHECK: cdlgbr %f0, 0, %r15, 1
+# CHECK: cdlgbr %f0, 0, %r15, 1
0xb3 0xa1 0x01 0x0f
-# CHECK: cdlgbr %f0, 15, %r0, 1
+# CHECK: cdlgbr %f0, 15, %r0, 1
0xb3 0xa1 0xf1 0x00
-# CHECK: cdlgbr %f4, 5, %r6, 7
+# CHECK: cdlgbr %f4, 5, %r6, 7
0xb3 0xa1 0x57 0x46
-# CHECK: cdlgbr %f15, 0, %r0, 1
+# CHECK: cdlgbr %f15, 0, %r0, 1
0xb3 0xa1 0x01 0xf0
# CHECK: cds %r0, %r0, 0
@@ -1432,18 +1573,6 @@
# CHECK: cdsy %r14, %r0, 0
0xeb 0xe0 0x00 0x00 0x00 0x31
-# CHECK: cebr %f0, %f0
-0xb3 0x09 0x00 0x00
-
-# CHECK: cebr %f0, %f15
-0xb3 0x09 0x00 0x0f
-
-# CHECK: cebr %f7, %f8
-0xb3 0x09 0x00 0x78
-
-# CHECK: cebr %f15, %f0
-0xb3 0x09 0x00 0xf0
-
# CHECK: ceb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x09
@@ -1465,6 +1594,18 @@
# CHECK: ceb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x09
+# CHECK: cebr %f0, %f0
+0xb3 0x09 0x00 0x00
+
+# CHECK: cebr %f0, %f15
+0xb3 0x09 0x00 0x0f
+
+# CHECK: cebr %f7, %f8
+0xb3 0x09 0x00 0x78
+
+# CHECK: cebr %f15, %f0
+0xb3 0x09 0x00 0xf0
+
# CHECK: cefbr %f0, %r0
0xb3 0x94 0x00 0x00
@@ -1480,22 +1621,22 @@
# CHECK: cefbr %f15, %r15
0xb3 0x94 0x00 0xff
-# CHECK: cefbra %f0, 0, %r0, 1
+# CHECK: cefbra %f0, 0, %r0, 1
0xb3 0x94 0x01 0x00
-# CHECK: cefbra %f0, 0, %r0, 15
+# CHECK: cefbra %f0, 0, %r0, 15
0xb3 0x94 0x0f 0x00
-# CHECK: cefbra %f0, 0, %r15, 1
+# CHECK: cefbra %f0, 0, %r15, 1
0xb3 0x94 0x01 0x0f
-# CHECK: cefbra %f0, 15, %r0, 1
+# CHECK: cefbra %f0, 15, %r0, 1
0xb3 0x94 0xf1 0x00
-# CHECK: cefbra %f4, 5, %r6, 7
+# CHECK: cefbra %f4, 5, %r6, 7
0xb3 0x94 0x57 0x46
-# CHECK: cefbra %f15, 0, %r0, 1
+# CHECK: cefbra %f15, 0, %r0, 1
0xb3 0x94 0x01 0xf0
# CHECK: cegbr %f0, %r0
@@ -1513,60 +1654,78 @@
# CHECK: cegbr %f15, %r15
0xb3 0xa4 0x00 0xff
-# CHECK: cegbra %f0, 0, %r0, 1
+# CHECK: cegbra %f0, 0, %r0, 1
0xb3 0xa4 0x01 0x00
-# CHECK: cegbra %f0, 0, %r0, 15
+# CHECK: cegbra %f0, 0, %r0, 15
0xb3 0xa4 0x0f 0x00
-# CHECK: cegbra %f0, 0, %r15, 1
+# CHECK: cegbra %f0, 0, %r15, 1
0xb3 0xa4 0x01 0x0f
-# CHECK: cegbra %f0, 15, %r0, 1
+# CHECK: cegbra %f0, 15, %r0, 1
0xb3 0xa4 0xf1 0x00
-# CHECK: cegbra %f4, 5, %r6, 7
+# CHECK: cegbra %f4, 5, %r6, 7
0xb3 0xa4 0x57 0x46
-# CHECK: cegbra %f15, 0, %r0, 1
+# CHECK: cegbra %f15, 0, %r0, 1
0xb3 0xa4 0x01 0xf0
-# CHECK: celfbr %f0, 0, %r0, 1
+# CHECK: celfbr %f0, 0, %r0, 1
0xb3 0x90 0x01 0x00
-# CHECK: celfbr %f0, 0, %r0, 15
+# CHECK: celfbr %f0, 0, %r0, 15
0xb3 0x90 0x0f 0x00
-# CHECK: celfbr %f0, 0, %r15, 1
+# CHECK: celfbr %f0, 0, %r15, 1
0xb3 0x90 0x01 0x0f
-# CHECK: celfbr %f0, 15, %r0, 1
+# CHECK: celfbr %f0, 15, %r0, 1
0xb3 0x90 0xf1 0x00
-# CHECK: celfbr %f4, 5, %r6, 7
+# CHECK: celfbr %f4, 5, %r6, 7
0xb3 0x90 0x57 0x46
-# CHECK: celfbr %f15, 0, %r0, 1
+# CHECK: celfbr %f15, 0, %r0, 1
0xb3 0x90 0x01 0xf0
-# CHECK: celgbr %f0, 0, %r0, 1
+# CHECK: celgbr %f0, 0, %r0, 1
0xb3 0xa0 0x01 0x00
-# CHECK: celgbr %f0, 0, %r0, 15
+# CHECK: celgbr %f0, 0, %r0, 15
0xb3 0xa0 0x0f 0x00
-# CHECK: celgbr %f0, 0, %r15, 1
+# CHECK: celgbr %f0, 0, %r15, 1
0xb3 0xa0 0x01 0x0f
-# CHECK: celgbr %f0, 15, %r0, 1
+# CHECK: celgbr %f0, 15, %r0, 1
0xb3 0xa0 0xf1 0x00
-# CHECK: celgbr %f4, 5, %r6, 7
+# CHECK: celgbr %f4, 5, %r6, 7
0xb3 0xa0 0x57 0x46
-# CHECK: celgbr %f15, 0, %r0, 1
+# CHECK: celgbr %f15, 0, %r0, 1
0xb3 0xa0 0x01 0xf0
+# CHECK: cfc 0
+0xb2 0x1a 0x00 0x00
+
+# CHECK: cfc 0(%r1)
+0xb2 0x1a 0x10 0x00
+
+# CHECK: cfc 0(%r15)
+0xb2 0x1a 0xf0 0x00
+
+# CHECK: cfc 4095
+0xb2 0x1a 0x0f 0xff
+
+# CHECK: cfc 4095(%r1)
+0xb2 0x1a 0x1f 0xff
+
+# CHECK: cfc 4095(%r15)
+0xb2 0x1a 0xff 0xff
+
# CHECK: cfdbr %r0, 0, %f0
0xb3 0x99 0x00 0x00
@@ -1582,22 +1741,22 @@
# CHECK: cfdbr %r15, 0, %f0
0xb3 0x99 0x00 0xf0
-# CHECK: cfdbra %r0, 0, %f0, 1
+# CHECK: cfdbra %r0, 0, %f0, 1
0xb3 0x99 0x01 0x00
-# CHECK: cfdbra %r0, 0, %f0, 15
+# CHECK: cfdbra %r0, 0, %f0, 15
0xb3 0x99 0x0f 0x00
-# CHECK: cfdbra %r0, 0, %f15, 1
+# CHECK: cfdbra %r0, 0, %f15, 1
0xb3 0x99 0x01 0x0f
-# CHECK: cfdbra %r0, 15, %f0, 1
+# CHECK: cfdbra %r0, 15, %f0, 1
0xb3 0x99 0xf1 0x00
-# CHECK: cfdbra %r4, 5, %f6, 7
+# CHECK: cfdbra %r4, 5, %f6, 7
0xb3 0x99 0x57 0x46
-# CHECK: cfdbra %r15, 0, %f0, 1
+# CHECK: cfdbra %r15, 0, %f0, 1
0xb3 0x99 0x01 0xf0
# CHECK: cfebr %r0, 0, %f0
@@ -1615,22 +1774,22 @@
# CHECK: cfebr %r15, 0, %f0
0xb3 0x98 0x00 0xf0
-# CHECK: cfebra %r0, 0, %f0, 1
+# CHECK: cfebra %r0, 0, %f0, 1
0xb3 0x98 0x01 0x00
-# CHECK: cfebra %r0, 0, %f0, 15
+# CHECK: cfebra %r0, 0, %f0, 15
0xb3 0x98 0x0f 0x00
-# CHECK: cfebra %r0, 0, %f15, 1
+# CHECK: cfebra %r0, 0, %f15, 1
0xb3 0x98 0x01 0x0f
-# CHECK: cfebra %r0, 15, %f0, 1
+# CHECK: cfebra %r0, 15, %f0, 1
0xb3 0x98 0xf1 0x00
-# CHECK: cfebra %r4, 5, %f6, 7
+# CHECK: cfebra %r4, 5, %f6, 7
0xb3 0x98 0x57 0x46
-# CHECK: cfebra %r15, 0, %f0, 1
+# CHECK: cfebra %r15, 0, %f0, 1
0xb3 0x98 0x01 0xf0
# CHECK: cfi %r0, -2147483648
@@ -1666,24 +1825,54 @@
# CHECK: cfxbr %r15, 0, %f0
0xb3 0x9a 0x00 0xf0
-# CHECK: cfxbra %r0, 0, %f0, 1
+# CHECK: cfxbra %r0, 0, %f0, 1
0xb3 0x9a 0x01 0x00
-# CHECK: cfxbra %r0, 0, %f0, 15
+# CHECK: cfxbra %r0, 0, %f0, 15
0xb3 0x9a 0x0f 0x00
-# CHECK: cfxbra %r0, 0, %f13, 1
+# CHECK: cfxbra %r0, 0, %f13, 1
0xb3 0x9a 0x01 0x0d
-# CHECK: cfxbra %r0, 15, %f0, 1
+# CHECK: cfxbra %r0, 15, %f0, 1
0xb3 0x9a 0xf1 0x00
-# CHECK: cfxbra %r4, 5, %f8, 9
+# CHECK: cfxbra %r4, 5, %f8, 9
0xb3 0x9a 0x59 0x48
-# CHECK: cfxbra %r15, 0, %f0, 1
+# CHECK: cfxbra %r15, 0, %f0, 1
0xb3 0x9a 0x01 0xf0
+# CHECK: cg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x20
+
+# CHECK: cg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x20
+
+# CHECK: cg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x20
+
+# CHECK: cg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x20
+
+# CHECK: cg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x20
+
+# CHECK: cg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x20
+
+# CHECK: cg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x20
+
+# CHECK: cg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x20
+
+# CHECK: cg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x20
+
+# CHECK: cg %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x20
+
# CHECK: cgdbr %r0, 0, %f0
0xb3 0xa9 0x00 0x00
@@ -1699,22 +1888,22 @@
# CHECK: cgdbr %r15, 0, %f0
0xb3 0xa9 0x00 0xf0
-# CHECK: cgdbra %r0, 0, %f0, 1
+# CHECK: cgdbra %r0, 0, %f0, 1
0xb3 0xa9 0x01 0x00
-# CHECK: cgdbra %r0, 0, %f0, 15
+# CHECK: cgdbra %r0, 0, %f0, 15
0xb3 0xa9 0x0f 0x00
-# CHECK: cgdbra %r0, 0, %f15, 1
+# CHECK: cgdbra %r0, 0, %f15, 1
0xb3 0xa9 0x01 0x0f
-# CHECK: cgdbra %r0, 15, %f0, 1
+# CHECK: cgdbra %r0, 15, %f0, 1
0xb3 0xa9 0xf1 0x00
-# CHECK: cgdbra %r4, 5, %f6, 7
+# CHECK: cgdbra %r4, 5, %f6, 7
0xb3 0xa9 0x57 0x46
-# CHECK: cgdbra %r15, 0, %f0, 1
+# CHECK: cgdbra %r15, 0, %f0, 1
0xb3 0xa9 0x01 0xf0
# CHECK: cgebr %r0, 0, %f0
@@ -1732,24 +1921,54 @@
# CHECK: cgebr %r15, 0, %f0
0xb3 0xa8 0x00 0xf0
-# CHECK: cgebra %r0, 0, %f0, 1
+# CHECK: cgebra %r0, 0, %f0, 1
0xb3 0xa8 0x01 0x00
-# CHECK: cgebra %r0, 0, %f0, 15
+# CHECK: cgebra %r0, 0, %f0, 15
0xb3 0xa8 0x0f 0x00
-# CHECK: cgebra %r0, 0, %f15, 1
+# CHECK: cgebra %r0, 0, %f15, 1
0xb3 0xa8 0x01 0x0f
-# CHECK: cgebra %r0, 15, %f0, 1
+# CHECK: cgebra %r0, 15, %f0, 1
0xb3 0xa8 0xf1 0x00
-# CHECK: cgebra %r4, 5, %f6, 7
+# CHECK: cgebra %r4, 5, %f6, 7
0xb3 0xa8 0x57 0x46
-# CHECK: cgebra %r15, 0, %f0, 1
+# CHECK: cgebra %r15, 0, %f0, 1
0xb3 0xa8 0x01 0xf0
+# CHECK: cgf %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x30
+
+# CHECK: cgf %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x30
+
+# CHECK: cgf %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x30
+
+# CHECK: cgf %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x30
+
+# CHECK: cgf %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x30
+
+# CHECK: cgf %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x30
+
+# CHECK: cgf %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x30
+
+# CHECK: cgf %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x30
+
+# CHECK: cgf %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x30
+
+# CHECK: cgf %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x30
+
# CHECK: cgfi %r0, -2147483648
0xc2 0x0c 0x80 0x00 0x00 0x00
@@ -1780,35 +1999,35 @@
# CHECK: cgfr %r7, %r8
0xb9 0x30 0x00 0x78
-# CHECK: cgf %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x30
+# CHECK: cgh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x34
-# CHECK: cgf %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x30
+# CHECK: cgh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x34
-# CHECK: cgf %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x30
+# CHECK: cgh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x34
-# CHECK: cgf %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x30
+# CHECK: cgh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x34
-# CHECK: cgf %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x30
+# CHECK: cgh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x34
-# CHECK: cgf %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x30
+# CHECK: cgh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x34
-# CHECK: cgf %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x30
+# CHECK: cgh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x34
-# CHECK: cgf %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x30
+# CHECK: cgh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x34
-# CHECK: cgf %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x30
+# CHECK: cgh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x34
-# CHECK: cgf %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x30
+# CHECK: cgh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x34
# CHECK: cghi %r0, -32768
0xa7 0x0f 0x80 0x00
@@ -1861,35 +2080,95 @@
# CHECK: cghsi 4095(%r15), 42
0xe5 0x58 0xff 0xff 0x00 0x2a
-# CHECK: cgh %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x34
+# CHECK: cgib %r0, 0, 0, 0
+0xec 0x00 0x00 0x00 0x00 0xfc
-# CHECK: cgh %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x34
+# CHECK: cgib %r0, -128, 0, 0
+0xec 0x00 0x00 0x00 0x80 0xfc
-# CHECK: cgh %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x34
+# CHECK: cgib %r0, -1, 0, 0
+0xec 0x00 0x00 0x00 0xff 0xfc
-# CHECK: cgh %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x34
+# CHECK: cgib %r0, 127, 0, 0
+0xec 0x00 0x00 0x00 0x7f 0xfc
-# CHECK: cgh %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x34
+# CHECK: cgib %r15, 0, 0, 0
+0xec 0xf0 0x00 0x00 0x00 0xfc
-# CHECK: cgh %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x34
+# CHECK: cgib %r7, 100, 0, 0
+0xec 0x70 0x00 0x00 0x64 0xfc
-# CHECK: cgh %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x34
+# CHECK: cgib %r0, 0, 0, 4095(%r15)
+0xec 0x00 0xff 0xff 0x00 0xfc
-# CHECK: cgh %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x34
+# CHECK: cgib %r0, 0, 0, 0(%r8)
+0xec 0x00 0x80 0x00 0x00 0xfc
-# CHECK: cgh %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x34
+# CHECK: cgib %r0, 0, 0, 4095(%r7)
+0xec 0x00 0x7f 0xff 0x00 0xfc
-# CHECK: cgh %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x34
+# CHECK: cgib %r0, 0, 1, 0
+0xec 0x01 0x00 0x00 0x00 0xfc
+
+# CHECK: cgibh %r0, 0, 0
+0xec 0x02 0x00 0x00 0x00 0xfc
+
+# CHECK: cgib %r0, 0, 3, 0
+0xec 0x03 0x00 0x00 0x00 0xfc
+
+# CHECK: cgibl %r0, 0, 0
+0xec 0x04 0x00 0x00 0x00 0xfc
+
+# CHECK: cgib %r0, 0, 5, 0
+0xec 0x05 0x00 0x00 0x00 0xfc
+
+# CHECK: cgiblh %r0, 0, 0
+0xec 0x06 0x00 0x00 0x00 0xfc
+
+# CHECK: cgib %r0, 0, 7, 0
+0xec 0x07 0x00 0x00 0x00 0xfc
+
+# CHECK: cgibe %r0, 0, 0
+0xec 0x08 0x00 0x00 0x00 0xfc
+
+# CHECK: cgib %r0, 0, 9, 0
+0xec 0x09 0x00 0x00 0x00 0xfc
+
+# CHECK: cgibhe %r0, 0, 0
+0xec 0x0a 0x00 0x00 0x00 0xfc
+
+# CHECK: cgib %r0, 0, 11, 0
+0xec 0x0b 0x00 0x00 0x00 0xfc
+
+# CHECK: cgible %r0, 0, 0
+0xec 0x0c 0x00 0x00 0x00 0xfc
+
+# CHECK: cgib %r0, 0, 13, 0
+0xec 0x0d 0x00 0x00 0x00 0xfc
+
+# CHECK: cgib %r0, 0, 14, 0
+0xec 0x0e 0x00 0x00 0x00 0xfc
+
+# CHECK: cgib %r0, 0, 15, 0
+0xec 0x0f 0x00 0x00 0x00 0xfc
+
+# CHECK: cgith %r0, 0
+0xec 0x00 0x00 0x00 0x20 0x70
+
+# CHECK: cgitl %r0, 0
+0xec 0x00 0x00 0x00 0x40 0x70
+
+# CHECK: cgite %r0, 0
+0xec 0x00 0x00 0x00 0x80 0x70
+
+# CHECK: cgitlh %r0, 0
+0xec 0x00 0x00 0x00 0x60 0x70
+
+# CHECK: cgithe %r0, 0
+0xec 0x00 0x00 0x00 0xa0 0x70
+
+# CHECK: cgitle %r0, 0
+0xec 0x00 0x00 0x00 0xc0 0x70
# CHECK: cgr %r0, %r0
0xb9 0x20 0x00 0x00
@@ -1987,126 +2266,6 @@
# CHECK: cgrtle %r0, %r1
0xb9 0x60 0xc0 0x01
-# CHECK: cg %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x20
-
-# CHECK: cg %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x20
-
-# CHECK: cg %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x20
-
-# CHECK: cg %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x20
-
-# CHECK: cg %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x20
-
-# CHECK: cg %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x20
-
-# CHECK: cg %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x20
-
-# CHECK: cg %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x20
-
-# CHECK: cg %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x20
-
-# CHECK: cg %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x20
-
-# CHECK: cgib %r0, 0, 0, 0
-0xec 0x00 0x00 0x00 0x00 0xfc
-
-# CHECK: cgib %r0, -128, 0, 0
-0xec 0x00 0x00 0x00 0x80 0xfc
-
-# CHECK: cgib %r0, -1, 0, 0
-0xec 0x00 0x00 0x00 0xff 0xfc
-
-# CHECK: cgib %r0, 127, 0, 0
-0xec 0x00 0x00 0x00 0x7f 0xfc
-
-# CHECK: cgib %r15, 0, 0, 0
-0xec 0xf0 0x00 0x00 0x00 0xfc
-
-# CHECK: cgib %r7, 100, 0, 0
-0xec 0x70 0x00 0x00 0x64 0xfc
-
-# CHECK: cgib %r0, 0, 0, 4095(%r15)
-0xec 0x00 0xff 0xff 0x00 0xfc
-
-# CHECK: cgib %r0, 0, 0, 0(%r8)
-0xec 0x00 0x80 0x00 0x00 0xfc
-
-# CHECK: cgib %r0, 0, 0, 4095(%r7)
-0xec 0x00 0x7f 0xff 0x00 0xfc
-
-# CHECK: cgib %r0, 0, 1, 0
-0xec 0x01 0x00 0x00 0x00 0xfc
-
-# CHECK: cgibh %r0, 0, 0
-0xec 0x02 0x00 0x00 0x00 0xfc
-
-# CHECK: cgib %r0, 0, 3, 0
-0xec 0x03 0x00 0x00 0x00 0xfc
-
-# CHECK: cgibl %r0, 0, 0
-0xec 0x04 0x00 0x00 0x00 0xfc
-
-# CHECK: cgib %r0, 0, 5, 0
-0xec 0x05 0x00 0x00 0x00 0xfc
-
-# CHECK: cgiblh %r0, 0, 0
-0xec 0x06 0x00 0x00 0x00 0xfc
-
-# CHECK: cgib %r0, 0, 7, 0
-0xec 0x07 0x00 0x00 0x00 0xfc
-
-# CHECK: cgibe %r0, 0, 0
-0xec 0x08 0x00 0x00 0x00 0xfc
-
-# CHECK: cgib %r0, 0, 9, 0
-0xec 0x09 0x00 0x00 0x00 0xfc
-
-# CHECK: cgibhe %r0, 0, 0
-0xec 0x0a 0x00 0x00 0x00 0xfc
-
-# CHECK: cgib %r0, 0, 11, 0
-0xec 0x0b 0x00 0x00 0x00 0xfc
-
-# CHECK: cgible %r0, 0, 0
-0xec 0x0c 0x00 0x00 0x00 0xfc
-
-# CHECK: cgib %r0, 0, 13, 0
-0xec 0x0d 0x00 0x00 0x00 0xfc
-
-# CHECK: cgib %r0, 0, 14, 0
-0xec 0x0e 0x00 0x00 0x00 0xfc
-
-# CHECK: cgib %r0, 0, 15, 0
-0xec 0x0f 0x00 0x00 0x00 0xfc
-
-# CHECK: cgith %r0, 0
-0xec 0x00 0x00 0x00 0x20 0x70
-
-# CHECK: cgitl %r0, 0
-0xec 0x00 0x00 0x00 0x40 0x70
-
-# CHECK: cgite %r0, 0
-0xec 0x00 0x00 0x00 0x80 0x70
-
-# CHECK: cgitlh %r0, 0
-0xec 0x00 0x00 0x00 0x60 0x70
-
-# CHECK: cgithe %r0, 0
-0xec 0x00 0x00 0x00 0xa0 0x70
-
-# CHECK: cgitle %r0, 0
-0xec 0x00 0x00 0x00 0xc0 0x70
-
# CHECK: cgxbr %r0, 0, %f0
0xb3 0xaa 0x00 0x00
@@ -2122,24 +2281,45 @@
# CHECK: cgxbr %r15, 0, %f0
0xb3 0xaa 0x00 0xf0
-# CHECK: cgxbra %r0, 0, %f0, 1
+# CHECK: cgxbra %r0, 0, %f0, 1
0xb3 0xaa 0x01 0x00
-# CHECK: cgxbra %r0, 0, %f0, 15
+# CHECK: cgxbra %r0, 0, %f0, 15
0xb3 0xaa 0x0f 0x00
-# CHECK: cgxbra %r0, 0, %f13, 1
+# CHECK: cgxbra %r0, 0, %f13, 1
0xb3 0xaa 0x01 0x0d
-# CHECK: cgxbra %r0, 15, %f0, 1
+# CHECK: cgxbra %r0, 15, %f0, 1
0xb3 0xaa 0xf1 0x00
-# CHECK: cgxbra %r4, 5, %f8, 9
+# CHECK: cgxbra %r4, 5, %f8, 9
0xb3 0xaa 0x59 0x48
-# CHECK: cgxbra %r15, 0, %f0, 1
+# CHECK: cgxbra %r15, 0, %f0, 1
0xb3 0xaa 0x01 0xf0
+# CHECK: ch %r0, 0
+0x49 0x00 0x00 0x00
+
+# CHECK: ch %r0, 4095
+0x49 0x00 0x0f 0xff
+
+# CHECK: ch %r0, 0(%r1)
+0x49 0x00 0x10 0x00
+
+# CHECK: ch %r0, 0(%r15)
+0x49 0x00 0xf0 0x00
+
+# CHECK: ch %r0, 4095(%r1,%r15)
+0x49 0x01 0xff 0xff
+
+# CHECK: ch %r0, 4095(%r15,%r1)
+0x49 0x0f 0x1f 0xff
+
+# CHECK: ch %r15, 0
+0x49 0xf0 0x00 0x00
+
# CHECK: chf %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0xcd
@@ -2254,27 +2434,6 @@
# CHECK: chsi 4095(%r15), 42
0xe5 0x5c 0xff 0xff 0x00 0x2a
-# CHECK: ch %r0, 0
-0x49 0x00 0x00 0x00
-
-# CHECK: ch %r0, 4095
-0x49 0x00 0x0f 0xff
-
-# CHECK: ch %r0, 0(%r1)
-0x49 0x00 0x10 0x00
-
-# CHECK: ch %r0, 0(%r15)
-0x49 0x00 0xf0 0x00
-
-# CHECK: ch %r0, 4095(%r1,%r15)
-0x49 0x01 0xff 0xff
-
-# CHECK: ch %r0, 4095(%r15,%r1)
-0x49 0x0f 0x1f 0xff
-
-# CHECK: ch %r15, 0
-0x49 0xf0 0x00 0x00
-
# CHECK: chy %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x79
@@ -2413,6 +2572,39 @@
# CHECK: citle %r0, 0
0xec 0x00 0x00 0x00 0xc0 0x72
+# CHECK: cksm %r0, %r0
+0xb2 0x41 0x00 0x00
+
+# CHECK: cksm %r0, %r14
+0xb2 0x41 0x00 0x0e
+
+# CHECK: cksm %r15, %r0
+0xb2 0x41 0x00 0xf0
+
+# CHECK: cksm %r6, %r8
+0xb2 0x41 0x00 0x68
+
+# CHECK: cl %r0, 0
+0x55 0x00 0x00 0x00
+
+# CHECK: cl %r0, 4095
+0x55 0x00 0x0f 0xff
+
+# CHECK: cl %r0, 0(%r1)
+0x55 0x00 0x10 0x00
+
+# CHECK: cl %r0, 0(%r15)
+0x55 0x00 0xf0 0x00
+
+# CHECK: cl %r0, 4095(%r1,%r15)
+0x55 0x01 0xff 0xff
+
+# CHECK: cl %r0, 4095(%r15,%r1)
+0x55 0x0f 0x1f 0xff
+
+# CHECK: cl %r15, 0
+0x55 0xf0 0x00 0x00
+
# CHECK: clc 0(1), 0
0xd5 0x00 0x00 0x00 0x00 0x00
@@ -2449,94 +2641,157 @@
# CHECK: clc 0(256,%r15), 0
0xd5 0xff 0xf0 0x00 0x00 0x00
-# CHECK: clfdbr %r0, 0, %f0, 1
+# CHECK: clcl %r0, %r8
+0x0f 0x08
+
+# CHECK: clcl %r0, %r14
+0x0f 0x0e
+
+# CHECK: clcl %r14, %r0
+0x0f 0xe0
+
+# CHECK: clcl %r14, %r8
+0x0f 0xe8
+
+# CHECK: clcle %r0, %r0, 0
+0xa9 0x00 0x00 0x00
+
+# CHECK: clcle %r0, %r14, 4095
+0xa9 0x0e 0x0f 0xff
+
+# CHECK: clcle %r0, %r0, 0(%r1)
+0xa9 0x00 0x10 0x00
+
+# CHECK: clcle %r0, %r0, 0(%r15)
+0xa9 0x00 0xf0 0x00
+
+# CHECK: clcle %r0, %r14, 4095(%r15)
+0xa9 0x0e 0xff 0xff
+
+# CHECK: clcle %r0, %r0, 4095(%r1)
+0xa9 0x00 0x1f 0xff
+
+# CHECK: clcle %r14, %r0, 0
+0xa9 0xe0 0x00 0x00
+
+# CHECK: clclu %r0, %r0, -524288
+0xeb 0x00 0x00 0x00 0x80 0x8f
+
+# CHECK: clclu %r0, %r0, -1
+0xeb 0x00 0x0f 0xff 0xff 0x8f
+
+# CHECK: clclu %r0, %r14, 0
+0xeb 0x0e 0x00 0x00 0x00 0x8f
+
+# CHECK: clclu %r0, %r14, 1
+0xeb 0x0e 0x00 0x01 0x00 0x8f
+
+# CHECK: clclu %r0, %r8, 524287
+0xeb 0x08 0x0f 0xff 0x7f 0x8f
+
+# CHECK: clclu %r0, %r8, 0(%r1)
+0xeb 0x08 0x10 0x00 0x00 0x8f
+
+# CHECK: clclu %r0, %r4, 0(%r15)
+0xeb 0x04 0xf0 0x00 0x00 0x8f
+
+# CHECK: clclu %r0, %r4, 524287(%r15)
+0xeb 0x04 0xff 0xff 0x7f 0x8f
+
+# CHECK: clclu %r0, %r0, 524287(%r1)
+0xeb 0x00 0x1f 0xff 0x7f 0x8f
+
+# CHECK: clclu %r14, %r0, 0
+0xeb 0xe0 0x00 0x00 0x00 0x8f
+
+# CHECK: clfdbr %r0, 0, %f0, 1
0xb3 0x9d 0x01 0x00
-# CHECK: clfdbr %r0, 0, %f0, 15
+# CHECK: clfdbr %r0, 0, %f0, 15
0xb3 0x9d 0x0f 0x00
-# CHECK: clfdbr %r0, 0, %f15, 1
+# CHECK: clfdbr %r0, 0, %f15, 1
0xb3 0x9d 0x01 0x0f
-# CHECK: clfdbr %r0, 15, %f0, 1
+# CHECK: clfdbr %r0, 15, %f0, 1
0xb3 0x9d 0xf1 0x00
-# CHECK: clfdbr %r4, 5, %f6, 7
+# CHECK: clfdbr %r4, 5, %f6, 7
0xb3 0x9d 0x57 0x46
-# CHECK: clfdbr %r15, 0, %f0, 1
+# CHECK: clfdbr %r15, 0, %f0, 1
0xb3 0x9d 0x01 0xf0
-# CHECK: clfebr %r0, 0, %f0, 1
+# CHECK: clfebr %r0, 0, %f0, 1
0xb3 0x9c 0x01 0x00
-# CHECK: clfebr %r0, 0, %f0, 15
+# CHECK: clfebr %r0, 0, %f0, 15
0xb3 0x9c 0x0f 0x00
-# CHECK: clfebr %r0, 0, %f15, 1
+# CHECK: clfebr %r0, 0, %f15, 1
0xb3 0x9c 0x01 0x0f
-# CHECK: clfebr %r0, 15, %f0, 1
+# CHECK: clfebr %r0, 15, %f0, 1
0xb3 0x9c 0xf1 0x00
-# CHECK: clfebr %r4, 5, %f6, 7
+# CHECK: clfebr %r4, 5, %f6, 7
0xb3 0x9c 0x57 0x46
-# CHECK: clfebr %r15, 0, %f0, 1
+# CHECK: clfebr %r15, 0, %f0, 1
0xb3 0x9c 0x01 0xf0
-# CHECK: clfxbr %r0, 0, %f0, 1
+# CHECK: clfxbr %r0, 0, %f0, 1
0xb3 0x9e 0x01 0x00
-# CHECK: clfxbr %r0, 0, %f0, 15
+# CHECK: clfxbr %r0, 0, %f0, 15
0xb3 0x9e 0x0f 0x00
-# CHECK: clfxbr %r0, 0, %f13, 1
+# CHECK: clfxbr %r0, 0, %f13, 1
0xb3 0x9e 0x01 0x0d
-# CHECK: clfxbr %r0, 15, %f0, 1
+# CHECK: clfxbr %r0, 15, %f0, 1
0xb3 0x9e 0xf1 0x00
-# CHECK: clfxbr %r4, 5, %f8, 9
+# CHECK: clfxbr %r4, 5, %f8, 9
0xb3 0x9e 0x59 0x48
-# CHECK: clfxbr %r15, 0, %f0, 1
+# CHECK: clfxbr %r15, 0, %f0, 1
0xb3 0x9e 0x01 0xf0
-# CHECK: clgdbr %r0, 0, %f0, 1
+# CHECK: clgdbr %r0, 0, %f0, 1
0xb3 0xad 0x01 0x00
-# CHECK: clgdbr %r0, 0, %f0, 15
+# CHECK: clgdbr %r0, 0, %f0, 15
0xb3 0xad 0x0f 0x00
-# CHECK: clgdbr %r0, 0, %f15, 1
+# CHECK: clgdbr %r0, 0, %f15, 1
0xb3 0xad 0x01 0x0f
-# CHECK: clgdbr %r0, 15, %f0, 1
+# CHECK: clgdbr %r0, 15, %f0, 1
0xb3 0xad 0xf1 0x00
-# CHECK: clgdbr %r4, 5, %f6, 7
+# CHECK: clgdbr %r4, 5, %f6, 7
0xb3 0xad 0x57 0x46
-# CHECK: clgdbr %r15, 0, %f0, 1
+# CHECK: clgdbr %r15, 0, %f0, 1
0xb3 0xad 0x01 0xf0
-# CHECK: clgebr %r0, 0, %f0, 1
+# CHECK: clgebr %r0, 0, %f0, 1
0xb3 0xac 0x01 0x00
-# CHECK: clgebr %r0, 0, %f0, 15
+# CHECK: clgebr %r0, 0, %f0, 15
0xb3 0xac 0x0f 0x00
-# CHECK: clgebr %r0, 0, %f15, 1
+# CHECK: clgebr %r0, 0, %f15, 1
0xb3 0xac 0x01 0x0f
-# CHECK: clgebr %r0, 15, %f0, 1
+# CHECK: clgebr %r0, 15, %f0, 1
0xb3 0xac 0xf1 0x00
-# CHECK: clgebr %r4, 5, %f6, 7
+# CHECK: clgebr %r4, 5, %f6, 7
0xb3 0xac 0x57 0x46
-# CHECK: clgebr %r15, 0, %f0, 1
+# CHECK: clgebr %r15, 0, %f0, 1
0xb3 0xac 0x01 0xf0
# CHECK: clgib %r0, 0, 0, 0
@@ -2611,22 +2866,22 @@
# CHECK: clgib %r0, 0, 15, 0
0xec 0x0f 0x00 0x00 0x00 0xfd
-# CHECK: clgxbr %r0, 0, %f0, 1
+# CHECK: clgxbr %r0, 0, %f0, 1
0xb3 0xae 0x01 0x00
-# CHECK: clgxbr %r0, 0, %f0, 15
+# CHECK: clgxbr %r0, 0, %f0, 15
0xb3 0xae 0x0f 0x00
-# CHECK: clgxbr %r0, 0, %f13, 1
+# CHECK: clgxbr %r0, 0, %f13, 1
0xb3 0xae 0x01 0x0d
-# CHECK: clgxbr %r0, 15, %f0, 1
+# CHECK: clgxbr %r0, 15, %f0, 1
0xb3 0xae 0xf1 0x00
-# CHECK: clgxbr %r4, 5, %f8, 9
+# CHECK: clgxbr %r4, 5, %f8, 9
0xb3 0xae 0x59 0x48
-# CHECK: clgxbr %r15, 0, %f0, 1
+# CHECK: clgxbr %r15, 0, %f0, 1
0xb3 0xae 0x01 0xf0
# CHECK: clfhsi 0, 0
@@ -2677,6 +2932,36 @@
# CHECK: clfitle %r0, 0
0xec 0x00 0x00 0x00 0xc0 0x73
+# CHECK: clg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x21
+
+# CHECK: clg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x21
+
+# CHECK: clg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x21
+
+# CHECK: clg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x21
+
+# CHECK: clg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x21
+
+# CHECK: clg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x21
+
+# CHECK: clg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x21
+
+# CHECK: clg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x21
+
+# CHECK: clg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x21
+
+# CHECK: clg %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x21
+
# CHECK: clgith %r0, 0
0xec 0x00 0x00 0x00 0x20 0x71
@@ -2845,36 +3130,6 @@
# CHECK: clgrb %r0, %r0, 15, 0
0xec 0x00 0x00 0x00 0xf0 0xe5
-# CHECK: clg %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x21
-
-# CHECK: clg %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x21
-
-# CHECK: clg %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x21
-
-# CHECK: clg %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x21
-
-# CHECK: clg %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x21
-
-# CHECK: clg %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x21
-
-# CHECK: clg %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x21
-
-# CHECK: clg %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x21
-
-# CHECK: clg %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x21
-
-# CHECK: clg %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x21
-
# CHECK: clhf %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0xcf
@@ -3061,6 +3316,87 @@
# CHECK: cliy 524287(%r15), 42
0xeb 0x2a 0xff 0xff 0x7f 0x55
+# CHECK: clm %r0, 0, 0
+0xbd 0x00 0x00 0x00
+
+# CHECK: clm %r0, 15, 4095
+0xbd 0x0f 0x0f 0xff
+
+# CHECK: clm %r0, 0, 0(%r1)
+0xbd 0x00 0x10 0x00
+
+# CHECK: clm %r0, 0, 0(%r15)
+0xbd 0x00 0xf0 0x00
+
+# CHECK: clm %r0, 15, 4095(%r15)
+0xbd 0x0f 0xff 0xff
+
+# CHECK: clm %r0, 0, 4095(%r1)
+0xbd 0x00 0x1f 0xff
+
+# CHECK: clm %r15, 0, 0
+0xbd 0xf0 0x00 0x00
+
+# CHECK: clmh %r0, 0, -524288
+0xeb 0x00 0x00 0x00 0x80 0x20
+
+# CHECK: clmh %r0, 0, -1
+0xeb 0x00 0x0f 0xff 0xff 0x20
+
+# CHECK: clmh %r0, 15, 0
+0xeb 0x0f 0x00 0x00 0x00 0x20
+
+# CHECK: clmh %r0, 15, 1
+0xeb 0x0f 0x00 0x01 0x00 0x20
+
+# CHECK: clmh %r0, 8, 524287
+0xeb 0x08 0x0f 0xff 0x7f 0x20
+
+# CHECK: clmh %r0, 8, 0(%r1)
+0xeb 0x08 0x10 0x00 0x00 0x20
+
+# CHECK: clmh %r0, 4, 0(%r15)
+0xeb 0x04 0xf0 0x00 0x00 0x20
+
+# CHECK: clmh %r0, 4, 524287(%r15)
+0xeb 0x04 0xff 0xff 0x7f 0x20
+
+# CHECK: clmh %r0, 0, 524287(%r1)
+0xeb 0x00 0x1f 0xff 0x7f 0x20
+
+# CHECK: clmh %r15, 0, 0
+0xeb 0xf0 0x00 0x00 0x00 0x20
+
+# CHECK: clmy %r0, 0, -524288
+0xeb 0x00 0x00 0x00 0x80 0x21
+
+# CHECK: clmy %r0, 0, -1
+0xeb 0x00 0x0f 0xff 0xff 0x21
+
+# CHECK: clmy %r0, 15, 0
+0xeb 0x0f 0x00 0x00 0x00 0x21
+
+# CHECK: clmy %r0, 15, 1
+0xeb 0x0f 0x00 0x01 0x00 0x21
+
+# CHECK: clmy %r0, 8, 524287
+0xeb 0x08 0x0f 0xff 0x7f 0x21
+
+# CHECK: clmy %r0, 8, 0(%r1)
+0xeb 0x08 0x10 0x00 0x00 0x21
+
+# CHECK: clmy %r0, 4, 0(%r15)
+0xeb 0x04 0xf0 0x00 0x00 0x21
+
+# CHECK: clmy %r0, 4, 524287(%r15)
+0xeb 0x04 0xff 0xff 0x7f 0x21
+
+# CHECK: clmy %r0, 0, 524287(%r1)
+0xeb 0x00 0x1f 0xff 0x7f 0x21
+
+# CHECK: clmy %r15, 0, 0
+0xeb 0xf0 0x00 0x00 0x00 0x21
+
# CHECK: clr %r0, %r0
0x15 0x00
@@ -3271,27 +3607,6 @@
# CHECK: clst %r7, %r8
0xb2 0x5d 0x00 0x78
-# CHECK: cl %r0, 0
-0x55 0x00 0x00 0x00
-
-# CHECK: cl %r0, 4095
-0x55 0x00 0x0f 0xff
-
-# CHECK: cl %r0, 0(%r1)
-0x55 0x00 0x10 0x00
-
-# CHECK: cl %r0, 0(%r15)
-0x55 0x00 0xf0 0x00
-
-# CHECK: cl %r0, 4095(%r1,%r15)
-0x55 0x01 0xff 0xff
-
-# CHECK: cl %r0, 4095(%r15,%r1)
-0x55 0x0f 0x1f 0xff
-
-# CHECK: cl %r15, 0
-0x55 0xf0 0x00 0x00
-
# CHECK: cly %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x55
@@ -3322,6 +3637,60 @@
# CHECK: cly %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x55
+# CHECK: cmpsc %r0, %r0
+0xb2 0x63 0x00 0x00
+
+# CHECK: cmpsc %r0, %r14
+0xb2 0x63 0x00 0x0e
+
+# CHECK: cmpsc %r14, %r0
+0xb2 0x63 0x00 0xe0
+
+# CHECK: cmpsc %r6, %r8
+0xb2 0x63 0x00 0x68
+
+# CHECK: cp 0(1), 0(1)
+0xf9 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: cp 0(1), 0(1,%r1)
+0xf9 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: cp 0(1), 0(1,%r15)
+0xf9 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: cp 0(1), 4095(1)
+0xf9 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: cp 0(1), 4095(1,%r1)
+0xf9 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: cp 0(1), 4095(1,%r15)
+0xf9 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: cp 0(1,%r1), 0(1)
+0xf9 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: cp 0(1,%r15), 0(1)
+0xf9 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: cp 4095(1,%r1), 0(1)
+0xf9 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: cp 4095(1,%r15), 0(1)
+0xf9 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: cp 0(16,%r1), 0(1)
+0xf9 0xf0 0x10 0x00 0x00 0x00
+
+# CHECK: cp 0(16,%r15), 0(1)
+0xf9 0xf0 0xf0 0x00 0x00 0x00
+
+# CHECK: cp 0(1), 0(16,%r1)
+0xf9 0x0f 0x00 0x00 0x10 0x00
+
+# CHECK: cp 0(1), 0(16,%r15)
+0xf9 0x0f 0x00 0x00 0xf0 0x00
+
# CHECK: cpsdr %f0, %f0, %f0
0xb3 0x72 0x00 0x00
@@ -3451,6 +3820,30 @@
# CHECK: crtle %r0, %r1
0xb9 0x72 0xc0 0x01
+# CHECK: cs %r0, %r0, 0
+0xba 0x00 0x00 0x00
+
+# CHECK: cs %r0, %r0, 4095
+0xba 0x00 0x0f 0xff
+
+# CHECK: cs %r0, %r0, 0(%r1)
+0xba 0x00 0x10 0x00
+
+# CHECK: cs %r0, %r0, 0(%r15)
+0xba 0x00 0xf0 0x00
+
+# CHECK: cs %r0, %r0, 4095(%r1)
+0xba 0x00 0x1f 0xff
+
+# CHECK: cs %r0, %r0, 4095(%r15)
+0xba 0x00 0xff 0xff
+
+# CHECK: cs %r0, %r15, 0
+0xba 0x0f 0x00 0x00
+
+# CHECK: cs %r15, %r0, 0
+0xba 0xf0 0x00 0x00
+
# CHECK: csg %r0, %r0, -524288
0xeb 0x00 0x00 0x00 0x80 0x30
@@ -3484,29 +3877,26 @@
# CHECK: csg %r15, %r0, 0
0xeb 0xf0 0x00 0x00 0x00 0x30
-# CHECK: cs %r0, %r0, 0
-0xba 0x00 0x00 0x00
-
-# CHECK: cs %r0, %r0, 4095
-0xba 0x00 0x0f 0xff
+# CHECK: csst 0, 0, %r0
+0xc8 0x02 0x00 0x00 0x00 0x00
-# CHECK: cs %r0, %r0, 0(%r1)
-0xba 0x00 0x10 0x00
+# CHECK: csst 0, 4095, %r2
+0xc8 0x22 0x00 0x00 0x0f 0xff
-# CHECK: cs %r0, %r0, 0(%r15)
-0xba 0x00 0xf0 0x00
+# CHECK: csst 0, 0(%r1), %r2
+0xc8 0x22 0x00 0x00 0x10 0x00
-# CHECK: cs %r0, %r0, 4095(%r1)
-0xba 0x00 0x1f 0xff
+# CHECK: csst 0, 0(%r15), %r2
+0xc8 0x22 0x00 0x00 0xf0 0x00
-# CHECK: cs %r0, %r0, 4095(%r15)
-0xba 0x00 0xff 0xff
+# CHECK: csst 0(%r1), 4095(%r15), %r2
+0xc8 0x22 0x10 0x00 0xff 0xff
-# CHECK: cs %r0, %r15, 0
-0xba 0x0f 0x00 0x00
+# CHECK: csst 0(%r1), 0(%r15), %r2
+0xc8 0x22 0x10 0x00 0xf0 0x00
-# CHECK: cs %r15, %r0, 0
-0xba 0xf0 0x00 0x00
+# CHECK: csst 4095(%r1), 0(%r15), %r2
+0xc8 0x22 0x1f 0xff 0xf0 0x00
# CHECK: csy %r0, %r0, -524288
0xeb 0x00 0x00 0x00 0x80 0x14
@@ -3541,47 +3931,275 @@
# CHECK: csy %r15, %r0, 0
0xeb 0xf0 0x00 0x00 0x00 0x14
-# CHECK: csst 0, 0, %r0
-0xc8 0x02 0x00 0x00 0x00 0x00
+# CHECK: cu12 %r0, %r0
+0xb2 0xa7 0x00 0x00
-# CHECK: csst 0, 4095, %r2
-0xc8 0x22 0x00 0x00 0x0f 0xff
+# CHECK: cu12 %r0, %r14
+0xb2 0xa7 0x00 0x0e
-# CHECK: csst 0, 0(%r1), %r2
-0xc8 0x22 0x00 0x00 0x10 0x00
+# CHECK: cu12 %r14, %r0
+0xb2 0xa7 0x00 0xe0
-# CHECK: csst 0, 0(%r15), %r2
-0xc8 0x22 0x00 0x00 0xf0 0x00
+# CHECK: cu12 %r6, %r8
+0xb2 0xa7 0x00 0x68
-# CHECK: csst 0(%r1), 4095(%r15), %r2
-0xc8 0x22 0x10 0x00 0xff 0xff
+# CHECK: cu12 %r4, %r12, 1
+0xb2 0xa7 0x10 0x4c
-# CHECK: csst 0(%r1), 0(%r15), %r2
-0xc8 0x22 0x10 0x00 0xf0 0x00
+# CHECK: cu12 %r4, %r12, 15
+0xb2 0xa7 0xf0 0x4c
-# CHECK: csst 4095(%r1), 0(%r15), %r2
-0xc8 0x22 0x1f 0xff 0xf0 0x00
+# CHECK: cu14 %r0, %r0
+0xb9 0xb0 0x00 0x00
-# CHECK: c %r0, 0
-0x59 0x00 0x00 0x00
+# CHECK: cu14 %r0, %r14
+0xb9 0xb0 0x00 0x0e
-# CHECK: c %r0, 4095
-0x59 0x00 0x0f 0xff
+# CHECK: cu14 %r14, %r0
+0xb9 0xb0 0x00 0xe0
-# CHECK: c %r0, 0(%r1)
-0x59 0x00 0x10 0x00
+# CHECK: cu14 %r6, %r8
+0xb9 0xb0 0x00 0x68
-# CHECK: c %r0, 0(%r15)
-0x59 0x00 0xf0 0x00
+# CHECK: cu14 %r4, %r12, 1
+0xb9 0xb0 0x10 0x4c
-# CHECK: c %r0, 4095(%r1,%r15)
-0x59 0x01 0xff 0xff
+# CHECK: cu14 %r4, %r12, 15
+0xb9 0xb0 0xf0 0x4c
-# CHECK: c %r0, 4095(%r15,%r1)
-0x59 0x0f 0x1f 0xff
+# CHECK: cu21 %r0, %r0
+0xb2 0xa6 0x00 0x00
-# CHECK: c %r15, 0
-0x59 0xf0 0x00 0x00
+# CHECK: cu21 %r0, %r14
+0xb2 0xa6 0x00 0x0e
+
+# CHECK: cu21 %r14, %r0
+0xb2 0xa6 0x00 0xe0
+
+# CHECK: cu21 %r6, %r8
+0xb2 0xa6 0x00 0x68
+
+# CHECK: cu21 %r4, %r12, 1
+0xb2 0xa6 0x10 0x4c
+
+# CHECK: cu21 %r4, %r12, 15
+0xb2 0xa6 0xf0 0x4c
+
+# CHECK: cu24 %r0, %r0
+0xb9 0xb1 0x00 0x00
+
+# CHECK: cu24 %r0, %r14
+0xb9 0xb1 0x00 0x0e
+
+# CHECK: cu24 %r14, %r0
+0xb9 0xb1 0x00 0xe0
+
+# CHECK: cu24 %r6, %r8
+0xb9 0xb1 0x00 0x68
+
+# CHECK: cu24 %r4, %r12, 1
+0xb9 0xb1 0x10 0x4c
+
+# CHECK: cu24 %r4, %r12, 15
+0xb9 0xb1 0xf0 0x4c
+
+# CHECK: cu41 %r0, %r0
+0xb9 0xb2 0x00 0x00
+
+# CHECK: cu41 %r0, %r14
+0xb9 0xb2 0x00 0x0e
+
+# CHECK: cu41 %r14, %r0
+0xb9 0xb2 0x00 0xe0
+
+# CHECK: cu41 %r6, %r8
+0xb9 0xb2 0x00 0x68
+
+# CHECK: cu42 %r0, %r0
+0xb9 0xb3 0x00 0x00
+
+# CHECK: cu42 %r0, %r14
+0xb9 0xb3 0x00 0x0e
+
+# CHECK: cu42 %r14, %r0
+0xb9 0xb3 0x00 0xe0
+
+# CHECK: cu42 %r6, %r8
+0xb9 0xb3 0x00 0x68
+
+# CHECK: cuse %r0, %r0
+0xb2 0x57 0x00 0x00
+
+# CHECK: cuse %r0, %r14
+0xb2 0x57 0x00 0x0e
+
+# CHECK: cuse %r14, %r0
+0xb2 0x57 0x00 0xe0
+
+# CHECK: cuse %r6, %r8
+0xb2 0x57 0x00 0x68
+
+# CHECK: cvb %r0, 0
+0x4f 0x00 0x00 0x00
+
+# CHECK: cvb %r0, 4095
+0x4f 0x00 0x0f 0xff
+
+# CHECK: cvb %r0, 0(%r1)
+0x4f 0x00 0x10 0x00
+
+# CHECK: cvb %r0, 0(%r15)
+0x4f 0x00 0xf0 0x00
+
+# CHECK: cvb %r0, 4095(%r1,%r15)
+0x4f 0x01 0xff 0xff
+
+# CHECK: cvb %r0, 4095(%r15,%r1)
+0x4f 0x0f 0x1f 0xff
+
+# CHECK: cvb %r15, 0
+0x4f 0xf0 0x00 0x00
+
+# CHECK: cvbg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x0e
+
+# CHECK: cvbg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x0e
+
+# CHECK: cvbg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x0e
+
+# CHECK: cvbg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x0e
+
+# CHECK: cvbg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x0e
+
+# CHECK: cvbg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x0e
+
+# CHECK: cvbg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x0e
+
+# CHECK: cvbg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x0e
+
+# CHECK: cvbg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x0e
+
+# CHECK: cvbg %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x0e
+
+# CHECK: cvby %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x06
+
+# CHECK: cvby %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x06
+
+# CHECK: cvby %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x06
+
+# CHECK: cvby %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x06
+
+# CHECK: cvby %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x06
+
+# CHECK: cvby %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x06
+
+# CHECK: cvby %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x06
+
+# CHECK: cvby %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x06
+
+# CHECK: cvby %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x06
+
+# CHECK: cvby %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x06
+
+# CHECK: cvd %r0, 0
+0x4e 0x00 0x00 0x00
+
+# CHECK: cvd %r0, 4095
+0x4e 0x00 0x0f 0xff
+
+# CHECK: cvd %r0, 0(%r1)
+0x4e 0x00 0x10 0x00
+
+# CHECK: cvd %r0, 0(%r15)
+0x4e 0x00 0xf0 0x00
+
+# CHECK: cvd %r0, 4095(%r1,%r15)
+0x4e 0x01 0xff 0xff
+
+# CHECK: cvd %r0, 4095(%r15,%r1)
+0x4e 0x0f 0x1f 0xff
+
+# CHECK: cvd %r15, 0
+0x4e 0xf0 0x00 0x00
+
+# CHECK: cvdg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x2e
+
+# CHECK: cvdg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x2e
+
+# CHECK: cvdg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x2e
+
+# CHECK: cvdg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x2e
+
+# CHECK: cvdg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x2e
+
+# CHECK: cvdg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x2e
+
+# CHECK: cvdg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x2e
+
+# CHECK: cvdg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x2e
+
+# CHECK: cvdg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x2e
+
+# CHECK: cvdg %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x2e
+
+# CHECK: cvdy %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x26
+
+# CHECK: cvdy %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x26
+
+# CHECK: cvdy %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x26
+
+# CHECK: cvdy %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x26
+
+# CHECK: cvdy %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x26
+
+# CHECK: cvdy %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x26
+
+# CHECK: cvdy %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x26
+
+# CHECK: cvdy %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x26
+
+# CHECK: cvdy %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x26
+
+# CHECK: cvdy %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x26
# CHECK: cxbr %f0, %f0
0xb3 0x49 0x00 0x00
@@ -3610,22 +4228,22 @@
# CHECK: cxfbr %f13, %r15
0xb3 0x96 0x00 0xdf
-# CHECK: cxfbra %f0, 0, %r0, 1
+# CHECK: cxfbra %f0, 0, %r0, 1
0xb3 0x96 0x01 0x00
-# CHECK: cxfbra %f0, 0, %r0, 15
+# CHECK: cxfbra %f0, 0, %r0, 15
0xb3 0x96 0x0f 0x00
-# CHECK: cxfbra %f0, 0, %r15, 1
+# CHECK: cxfbra %f0, 0, %r15, 1
0xb3 0x96 0x01 0x0f
-# CHECK: cxfbra %f0, 15, %r0, 1
+# CHECK: cxfbra %f0, 15, %r0, 1
0xb3 0x96 0xf1 0x00
-# CHECK: cxfbra %f4, 5, %r6, 7
+# CHECK: cxfbra %f4, 5, %r6, 7
0xb3 0x96 0x57 0x46
-# CHECK: cxfbra %f13, 0, %r0, 1
+# CHECK: cxfbra %f13, 0, %r0, 1
0xb3 0x96 0x01 0xd0
# CHECK: cxgbr %f0, %r0
@@ -3643,58 +4261,58 @@
# CHECK: cxgbr %f13, %r15
0xb3 0xa6 0x00 0xdf
-# CHECK: cxgbra %f0, 0, %r0, 1
+# CHECK: cxgbra %f0, 0, %r0, 1
0xb3 0xa6 0x01 0x00
-# CHECK: cxgbra %f0, 0, %r0, 15
+# CHECK: cxgbra %f0, 0, %r0, 15
0xb3 0xa6 0x0f 0x00
-# CHECK: cxgbra %f0, 0, %r15, 1
+# CHECK: cxgbra %f0, 0, %r15, 1
0xb3 0xa6 0x01 0x0f
-# CHECK: cxgbra %f0, 15, %r0, 1
+# CHECK: cxgbra %f0, 15, %r0, 1
0xb3 0xa6 0xf1 0x00
-# CHECK: cxgbra %f4, 5, %r6, 7
+# CHECK: cxgbra %f4, 5, %r6, 7
0xb3 0xa6 0x57 0x46
-# CHECK: cxgbra %f13, 0, %r0, 1
+# CHECK: cxgbra %f13, 0, %r0, 1
0xb3 0xa6 0x01 0xd0
-# CHECK: cxlfbr %f0, 0, %r0, 1
+# CHECK: cxlfbr %f0, 0, %r0, 1
0xb3 0x92 0x01 0x00
-# CHECK: cxlfbr %f0, 0, %r0, 15
+# CHECK: cxlfbr %f0, 0, %r0, 15
0xb3 0x92 0x0f 0x00
-# CHECK: cxlfbr %f0, 0, %r15, 1
+# CHECK: cxlfbr %f0, 0, %r15, 1
0xb3 0x92 0x01 0x0f
-# CHECK: cxlfbr %f0, 15, %r0, 1
+# CHECK: cxlfbr %f0, 15, %r0, 1
0xb3 0x92 0xf1 0x00
-# CHECK: cxlfbr %f4, 5, %r6, 7
+# CHECK: cxlfbr %f4, 5, %r6, 7
0xb3 0x92 0x57 0x46
-# CHECK: cxlfbr %f13, 0, %r0, 1
+# CHECK: cxlfbr %f13, 0, %r0, 1
0xb3 0x92 0x01 0xd0
-# CHECK: cxlgbr %f0, 0, %r0, 1
+# CHECK: cxlgbr %f0, 0, %r0, 1
0xb3 0xa2 0x01 0x00
-# CHECK: cxlgbr %f0, 0, %r0, 15
+# CHECK: cxlgbr %f0, 0, %r0, 15
0xb3 0xa2 0x0f 0x00
-# CHECK: cxlgbr %f0, 0, %r15, 1
+# CHECK: cxlgbr %f0, 0, %r15, 1
0xb3 0xa2 0x01 0x0f
-# CHECK: cxlgbr %f0, 15, %r0, 1
+# CHECK: cxlgbr %f0, 15, %r0, 1
0xb3 0xa2 0xf1 0x00
-# CHECK: cxlgbr %f4, 5, %r6, 7
+# CHECK: cxlgbr %f4, 5, %r6, 7
0xb3 0xa2 0x57 0x46
-# CHECK: cxlgbr %f13, 0, %r0, 1
+# CHECK: cxlgbr %f13, 0, %r0, 1
0xb3 0xa2 0x01 0xd0
# CHECK: cy %r0, -524288
@@ -3727,17 +4345,26 @@
# CHECK: cy %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x59
-# CHECK: ddbr %f0, %f0
-0xb3 0x1d 0x00 0x00
+# CHECK: d %r0, 0
+0x5d 0x00 0x00 0x00
-# CHECK: ddbr %f0, %f15
-0xb3 0x1d 0x00 0x0f
+# CHECK: d %r0, 4095
+0x5d 0x00 0x0f 0xff
-# CHECK: ddbr %f7, %f8
-0xb3 0x1d 0x00 0x78
+# CHECK: d %r0, 0(%r1)
+0x5d 0x00 0x10 0x00
-# CHECK: ddbr %f15, %f0
-0xb3 0x1d 0x00 0xf0
+# CHECK: d %r0, 0(%r15)
+0x5d 0x00 0xf0 0x00
+
+# CHECK: d %r0, 4095(%r1,%r15)
+0x5d 0x01 0xff 0xff
+
+# CHECK: d %r0, 4095(%r15,%r1)
+0x5d 0x0f 0x1f 0xff
+
+# CHECK: d %r14, 0
+0x5d 0xe0 0x00 0x00
# CHECK: ddb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x1d
@@ -3760,17 +4387,17 @@
# CHECK: ddb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x1d
-# CHECK: debr %f0, %f0
-0xb3 0x0d 0x00 0x00
+# CHECK: ddbr %f0, %f0
+0xb3 0x1d 0x00 0x00
-# CHECK: debr %f0, %f15
-0xb3 0x0d 0x00 0x0f
+# CHECK: ddbr %f0, %f15
+0xb3 0x1d 0x00 0x0f
-# CHECK: debr %f7, %f8
-0xb3 0x0d 0x00 0x78
+# CHECK: ddbr %f7, %f8
+0xb3 0x1d 0x00 0x78
-# CHECK: debr %f15, %f0
-0xb3 0x0d 0x00 0xf0
+# CHECK: ddbr %f15, %f0
+0xb3 0x1d 0x00 0xf0
# CHECK: deb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x0d
@@ -3793,17 +4420,83 @@
# CHECK: deb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x0d
-# CHECK: dlgr %r0, %r0
-0xb9 0x87 0x00 0x00
+# CHECK: debr %f0, %f0
+0xb3 0x0d 0x00 0x00
-# CHECK: dlgr %r0, %r15
-0xb9 0x87 0x00 0x0f
+# CHECK: debr %f0, %f15
+0xb3 0x0d 0x00 0x0f
-# CHECK: dlgr %r14, %r0
-0xb9 0x87 0x00 0xe0
+# CHECK: debr %f7, %f8
+0xb3 0x0d 0x00 0x78
-# CHECK: dlgr %r6, %r9
-0xb9 0x87 0x00 0x69
+# CHECK: debr %f15, %f0
+0xb3 0x0d 0x00 0xf0
+
+# CHECK: didbr %f0, %f0, %f0, 1
+0xb3 0x5b 0x01 0x00
+
+# CHECK: didbr %f0, %f0, %f0, 15
+0xb3 0x5b 0x0f 0x00
+
+# CHECK: didbr %f0, %f0, %f15, 1
+0xb3 0x5b 0x01 0x0f
+
+# CHECK: didbr %f0, %f15, %f0, 1
+0xb3 0x5b 0xf1 0x00
+
+# CHECK: didbr %f4, %f5, %f6, 7
+0xb3 0x5b 0x57 0x46
+
+# CHECK: didbr %f15, %f0, %f0, 1
+0xb3 0x5b 0x01 0xf0
+
+# CHECK: diebr %f0, %f0, %f0, 1
+0xb3 0x53 0x01 0x00
+
+# CHECK: diebr %f0, %f0, %f0, 15
+0xb3 0x53 0x0f 0x00
+
+# CHECK: diebr %f0, %f0, %f15, 1
+0xb3 0x53 0x01 0x0f
+
+# CHECK: diebr %f0, %f15, %f0, 1
+0xb3 0x53 0xf1 0x00
+
+# CHECK: diebr %f4, %f5, %f6, 7
+0xb3 0x53 0x57 0x46
+
+# CHECK: diebr %f15, %f0, %f0, 1
+0xb3 0x53 0x01 0xf0
+
+# CHECK: dl %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x97
+
+# CHECK: dl %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x97
+
+# CHECK: dl %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x97
+
+# CHECK: dl %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x97
+
+# CHECK: dl %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x97
+
+# CHECK: dl %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x97
+
+# CHECK: dl %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x97
+
+# CHECK: dl %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x97
+
+# CHECK: dl %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x97
+
+# CHECK: dl %r14, 0
+0xe3 0xe0 0x00 0x00 0x00 0x97
# CHECK: dlg %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x87
@@ -3835,6 +4528,18 @@
# CHECK: dlg %r14, 0
0xe3 0xe0 0x00 0x00 0x00 0x87
+# CHECK: dlgr %r0, %r0
+0xb9 0x87 0x00 0x00
+
+# CHECK: dlgr %r0, %r15
+0xb9 0x87 0x00 0x0f
+
+# CHECK: dlgr %r14, %r0
+0xb9 0x87 0x00 0xe0
+
+# CHECK: dlgr %r6, %r9
+0xb9 0x87 0x00 0x69
+
# CHECK: dlr %r0, %r0
0xb9 0x97 0x00 0x00
@@ -3847,47 +4552,89 @@
# CHECK: dlr %r6, %r9
0xb9 0x97 0x00 0x69
-# CHECK: dl %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x97
+# CHECK: dp 0(1), 0(1)
+0xfd 0x00 0x00 0x00 0x00 0x00
-# CHECK: dl %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x97
+# CHECK: dp 0(1), 0(1,%r1)
+0xfd 0x00 0x00 0x00 0x10 0x00
-# CHECK: dl %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x97
+# CHECK: dp 0(1), 0(1,%r15)
+0xfd 0x00 0x00 0x00 0xf0 0x00
-# CHECK: dl %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x97
+# CHECK: dp 0(1), 4095(1)
+0xfd 0x00 0x00 0x00 0x0f 0xff
-# CHECK: dl %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x97
+# CHECK: dp 0(1), 4095(1,%r1)
+0xfd 0x00 0x00 0x00 0x1f 0xff
-# CHECK: dl %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x97
+# CHECK: dp 0(1), 4095(1,%r15)
+0xfd 0x00 0x00 0x00 0xff 0xff
-# CHECK: dl %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x97
+# CHECK: dp 0(1,%r1), 0(1)
+0xfd 0x00 0x10 0x00 0x00 0x00
-# CHECK: dl %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x97
+# CHECK: dp 0(1,%r15), 0(1)
+0xfd 0x00 0xf0 0x00 0x00 0x00
-# CHECK: dl %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x97
+# CHECK: dp 4095(1,%r1), 0(1)
+0xfd 0x00 0x1f 0xff 0x00 0x00
-# CHECK: dl %r14, 0
-0xe3 0xe0 0x00 0x00 0x00 0x97
+# CHECK: dp 4095(1,%r15), 0(1)
+0xfd 0x00 0xff 0xff 0x00 0x00
-# CHECK: dsgfr %r0, %r0
-0xb9 0x1d 0x00 0x00
+# CHECK: dp 0(16,%r1), 0(1)
+0xfd 0xf0 0x10 0x00 0x00 0x00
-# CHECK: dsgfr %r0, %r15
-0xb9 0x1d 0x00 0x0f
+# CHECK: dp 0(16,%r15), 0(1)
+0xfd 0xf0 0xf0 0x00 0x00 0x00
-# CHECK: dsgfr %r14, %r0
-0xb9 0x1d 0x00 0xe0
+# CHECK: dp 0(1), 0(16,%r1)
+0xfd 0x0f 0x00 0x00 0x10 0x00
-# CHECK: dsgfr %r6, %r9
-0xb9 0x1d 0x00 0x69
+# CHECK: dp 0(1), 0(16,%r15)
+0xfd 0x0f 0x00 0x00 0xf0 0x00
+
+# CHECK: dr %r0, %r0
+0x1d 0x00
+
+# CHECK: dr %r0, %r15
+0x1d 0x0f
+
+# CHECK: dr %r14, %r0
+0x1d 0xe0
+
+# CHECK: dr %r6, %r9
+0x1d 0x69
+
+# CHECK: dsg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x0d
+
+# CHECK: dsg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x0d
+
+# CHECK: dsg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x0d
+
+# CHECK: dsg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x0d
+
+# CHECK: dsg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x0d
+
+# CHECK: dsg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x0d
+
+# CHECK: dsg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x0d
+
+# CHECK: dsg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x0d
+
+# CHECK: dsg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x0d
+
+# CHECK: dsg %r14, 0
+0xe3 0xe0 0x00 0x00 0x00 0x0d
# CHECK: dsgf %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x1d
@@ -3919,6 +4666,18 @@
# CHECK: dsgf %r14, 0
0xe3 0xe0 0x00 0x00 0x00 0x1d
+# CHECK: dsgfr %r0, %r0
+0xb9 0x1d 0x00 0x00
+
+# CHECK: dsgfr %r0, %r15
+0xb9 0x1d 0x00 0x0f
+
+# CHECK: dsgfr %r14, %r0
+0xb9 0x1d 0x00 0xe0
+
+# CHECK: dsgfr %r6, %r9
+0xb9 0x1d 0x00 0x69
+
# CHECK: dsgr %r0, %r0
0xb9 0x0d 0x00 0x00
@@ -3931,36 +4690,6 @@
# CHECK: dsgr %r6, %r9
0xb9 0x0d 0x00 0x69
-# CHECK: dsg %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x0d
-
-# CHECK: dsg %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x0d
-
-# CHECK: dsg %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x0d
-
-# CHECK: dsg %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x0d
-
-# CHECK: dsg %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x0d
-
-# CHECK: dsg %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x0d
-
-# CHECK: dsg %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x0d
-
-# CHECK: dsg %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x0d
-
-# CHECK: dsg %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x0d
-
-# CHECK: dsg %r14, 0
-0xe3 0xe0 0x00 0x00 0x00 0x0d
-
# CHECK: dxbr %f0, %f0
0xb3 0x4d 0x00 0x00
@@ -3988,6 +4717,45 @@
# CHECK: ear %r15, %a15
0xb2 0x4f 0x00 0xff
+# CHECK: ecag %r0, %r0, 0
+0xeb 0x00 0x00 0x00 0x00 0x4c
+
+# CHECK: ecag %r0, %r15, 0
+0xeb 0x0f 0x00 0x00 0x00 0x4c
+
+# CHECK: ecag %r14, %r15, 0
+0xeb 0xef 0x00 0x00 0x00 0x4c
+
+# CHECK: ecag %r15, %r15, 0
+0xeb 0xff 0x00 0x00 0x00 0x4c
+
+# CHECK: ecag %r0, %r0, -524288
+0xeb 0x00 0x00 0x00 0x80 0x4c
+
+# CHECK: ecag %r0, %r0, -1
+0xeb 0x00 0x0f 0xff 0xff 0x4c
+
+# CHECK: ecag %r0, %r0, 0
+0xeb 0x00 0x00 0x00 0x00 0x4c
+
+# CHECK: ecag %r0, %r0, 1
+0xeb 0x00 0x00 0x01 0x00 0x4c
+
+# CHECK: ecag %r0, %r0, 524287
+0xeb 0x00 0x0f 0xff 0x7f 0x4c
+
+# CHECK: ecag %r0, %r0, 0(%r1)
+0xeb 0x00 0x10 0x00 0x00 0x4c
+
+# CHECK: ecag %r0, %r0, 0(%r15)
+0xeb 0x00 0xf0 0x00 0x00 0x4c
+
+# CHECK: ecag %r0, %r0, 524287(%r1)
+0xeb 0x00 0x1f 0xff 0x7f 0x4c
+
+# CHECK: ecag %r0, %r0, 524287(%r15)
+0xeb 0x00 0xff 0xff 0x7f 0x4c
+
# CHECK: ectg 0, 0, %r0
0xc8 0x01 0x00 0x00 0x00 0x00
@@ -4009,6 +4777,78 @@
# CHECK: ectg 4095(%r1), 0(%r15), %r2
0xc8 0x21 0x1f 0xff 0xf0 0x00
+# CHECK: ed 0(1), 0
+0xde 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: ed 0(1), 0(%r1)
+0xde 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: ed 0(1), 0(%r15)
+0xde 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: ed 0(1), 4095
+0xde 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: ed 0(1), 4095(%r1)
+0xde 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: ed 0(1), 4095(%r15)
+0xde 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: ed 0(1,%r1), 0
+0xde 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: ed 0(1,%r15), 0
+0xde 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: ed 4095(1,%r1), 0
+0xde 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: ed 4095(1,%r15), 0
+0xde 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: ed 0(256,%r1), 0
+0xde 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: ed 0(256,%r15), 0
+0xde 0xff 0xf0 0x00 0x00 0x00
+
+# CHECK: edmk 0(1), 0
+0xdf 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: edmk 0(1), 0(%r1)
+0xdf 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: edmk 0(1), 0(%r15)
+0xdf 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: edmk 0(1), 4095
+0xdf 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: edmk 0(1), 4095(%r1)
+0xdf 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: edmk 0(1), 4095(%r15)
+0xdf 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: edmk 0(1,%r1), 0
+0xdf 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: edmk 0(1,%r15), 0
+0xdf 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: edmk 4095(1,%r1), 0
+0xdf 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: edmk 4095(1,%r15), 0
+0xdf 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: edmk 0(256,%r1), 0
+0xdf 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: edmk 0(256,%r15), 0
+0xdf 0xff 0xf0 0x00 0x00 0x00
+
# CHECK: efpc %r0
0xb3 0x8c 0x00 0x00
@@ -4018,6 +4858,18 @@
# CHECK: efpc %r15
0xb3 0x8c 0x00 0xf0
+# CHECK: epsw %r0, %r0
+0xb9 0x8d 0x00 0x00
+
+# CHECK: epsw %r0, %r15
+0xb9 0x8d 0x00 0x0f
+
+# CHECK: epsw %r15, %r0
+0xb9 0x8d 0x00 0xf0
+
+# CHECK: epsw %r6, %r8
+0xb9 0x8d 0x00 0x68
+
# CHECK: etnd %r0
0xb2 0xec 0x00 0x00
@@ -4063,22 +4915,22 @@
# CHECK: fidbr %f15, 0, %f0
0xb3 0x5f 0x00 0xf0
-# CHECK: fidbra %f0, 0, %f0, 1
+# CHECK: fidbra %f0, 0, %f0, 1
0xb3 0x5f 0x01 0x00
-# CHECK: fidbra %f0, 0, %f0, 15
+# CHECK: fidbra %f0, 0, %f0, 15
0xb3 0x5f 0x0f 0x00
-# CHECK: fidbra %f0, 0, %f15, 1
+# CHECK: fidbra %f0, 0, %f15, 1
0xb3 0x5f 0x01 0x0f
-# CHECK: fidbra %f0, 15, %f0, 1
+# CHECK: fidbra %f0, 15, %f0, 1
0xb3 0x5f 0xf1 0x00
-# CHECK: fidbra %f4, 5, %f6, 7
+# CHECK: fidbra %f4, 5, %f6, 7
0xb3 0x5f 0x57 0x46
-# CHECK: fidbra %f15, 0, %f0, 1
+# CHECK: fidbra %f15, 0, %f0, 1
0xb3 0x5f 0x01 0xf0
# CHECK: fiebr %f0, 0, %f0
@@ -4096,22 +4948,22 @@
# CHECK: fiebr %f15, 0, %f0
0xb3 0x57 0x00 0xf0
-# CHECK: fiebra %f0, 0, %f0, 1
+# CHECK: fiebra %f0, 0, %f0, 1
0xb3 0x57 0x01 0x00
-# CHECK: fiebra %f0, 0, %f0, 15
+# CHECK: fiebra %f0, 0, %f0, 15
0xb3 0x57 0x0f 0x00
-# CHECK: fiebra %f0, 0, %f15, 1
+# CHECK: fiebra %f0, 0, %f15, 1
0xb3 0x57 0x01 0x0f
-# CHECK: fiebra %f0, 15, %f0, 1
+# CHECK: fiebra %f0, 15, %f0, 1
0xb3 0x57 0xf1 0x00
-# CHECK: fiebra %f4, 5, %f6, 7
+# CHECK: fiebra %f4, 5, %f6, 7
0xb3 0x57 0x57 0x46
-# CHECK: fiebra %f15, 0, %f0, 1
+# CHECK: fiebra %f15, 0, %f0, 1
0xb3 0x57 0x01 0xf0
# CHECK: fixbr %f0, 0, %f0
@@ -4129,22 +4981,22 @@
# CHECK: fixbr %f13, 0, %f0
0xb3 0x47 0x00 0xd0
-# CHECK: fixbra %f0, 0, %f0, 1
+# CHECK: fixbra %f0, 0, %f0, 1
0xb3 0x47 0x01 0x00
-# CHECK: fixbra %f0, 0, %f0, 15
+# CHECK: fixbra %f0, 0, %f0, 15
0xb3 0x47 0x0f 0x00
-# CHECK: fixbra %f0, 0, %f13, 1
+# CHECK: fixbra %f0, 0, %f13, 1
0xb3 0x47 0x01 0x0d
-# CHECK: fixbra %f0, 15, %f0, 1
+# CHECK: fixbra %f0, 15, %f0, 1
0xb3 0x47 0xf1 0x00
-# CHECK: fixbra %f4, 5, %f8, 9
+# CHECK: fixbra %f4, 5, %f8, 9
0xb3 0x47 0x59 0x48
-# CHECK: fixbra %f13, 0, %f0, 1
+# CHECK: fixbra %f13, 0, %f0, 1
0xb3 0x47 0x01 0xd0
# CHECK: flogr %r0, %r0
@@ -4366,6 +5218,201 @@
# CHECK: ipm %r15
0xb2 0x22 0x00 0xf0
+# CHECK: kdb %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x18
+
+# CHECK: kdb %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x18
+
+# CHECK: kdb %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x18
+
+# CHECK: kdb %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x18
+
+# CHECK: kdb %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x18
+
+# CHECK: kdb %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x18
+
+# CHECK: kdb %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x18
+
+# CHECK: kdbr %f0, %f0
+0xb3 0x18 0x00 0x00
+
+# CHECK: kdbr %f0, %f15
+0xb3 0x18 0x00 0x0f
+
+# CHECK: kdbr %f7, %f8
+0xb3 0x18 0x00 0x78
+
+# CHECK: kdbr %f15, %f0
+0xb3 0x18 0x00 0xf0
+
+# CHECK: keb %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x08
+
+# CHECK: keb %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x08
+
+# CHECK: keb %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x08
+
+# CHECK: keb %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x08
+
+# CHECK: keb %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x08
+
+# CHECK: keb %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x08
+
+# CHECK: keb %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x08
+
+# CHECK: kebr %f0, %f0
+0xb3 0x08 0x00 0x00
+
+# CHECK: kebr %f0, %f15
+0xb3 0x08 0x00 0x0f
+
+# CHECK: kebr %f7, %f8
+0xb3 0x08 0x00 0x78
+
+# CHECK: kebr %f15, %f0
+0xb3 0x08 0x00 0xf0
+
+# CHECK: kimd %r2, %r10
+0xb9 0x3e 0x00 0x2a
+
+# CHECK: kimd %r2, %r14
+0xb9 0x3e 0x00 0x2e
+
+# CHECK: kimd %r14, %r2
+0xb9 0x3e 0x00 0xe2
+
+# CHECK: kimd %r14, %r10
+0xb9 0x3e 0x00 0xea
+
+# CHECK: klmd %r2, %r10
+0xb9 0x3f 0x00 0x2a
+
+# CHECK: klmd %r2, %r14
+0xb9 0x3f 0x00 0x2e
+
+# CHECK: klmd %r14, %r2
+0xb9 0x3f 0x00 0xe2
+
+# CHECK: klmd %r14, %r10
+0xb9 0x3f 0x00 0xea
+
+# CHECK: km %r2, %r10
+0xb9 0x2e 0x00 0x2a
+
+# CHECK: km %r2, %r14
+0xb9 0x2e 0x00 0x2e
+
+# CHECK: km %r14, %r2
+0xb9 0x2e 0x00 0xe2
+
+# CHECK: km %r14, %r10
+0xb9 0x2e 0x00 0xea
+
+# CHECK: kmac %r2, %r10
+0xb9 0x1e 0x00 0x2a
+
+# CHECK: kmac %r2, %r14
+0xb9 0x1e 0x00 0x2e
+
+# CHECK: kmac %r14, %r2
+0xb9 0x1e 0x00 0xe2
+
+# CHECK: kmac %r14, %r10
+0xb9 0x1e 0x00 0xea
+
+# CHECK: kmc %r2, %r10
+0xb9 0x2f 0x00 0x2a
+
+# CHECK: kmc %r2, %r14
+0xb9 0x2f 0x00 0x2e
+
+# CHECK: kmc %r14, %r2
+0xb9 0x2f 0x00 0xe2
+
+# CHECK: kmc %r14, %r10
+0xb9 0x2f 0x00 0xea
+
+# CHECK: kmctr %r2, %r4, %r10
+0xb9 0x2d 0x40 0x2a
+
+# CHECK: kmctr %r2, %r6, %r14
+0xb9 0x2d 0x60 0x2e
+
+# CHECK: kmctr %r14, %r8, %r2
+0xb9 0x2d 0x80 0xe2
+
+# CHECK: kmctr %r14, %r12, %r10
+0xb9 0x2d 0xc0 0xea
+
+# CHECK: kmf %r2, %r10
+0xb9 0x2a 0x00 0x2a
+
+# CHECK: kmf %r2, %r14
+0xb9 0x2a 0x00 0x2e
+
+# CHECK: kmf %r14, %r2
+0xb9 0x2a 0x00 0xe2
+
+# CHECK: kmf %r14, %r10
+0xb9 0x2a 0x00 0xea
+
+# CHECK: kmo %r2, %r10
+0xb9 0x2b 0x00 0x2a
+
+# CHECK: kmo %r2, %r14
+0xb9 0x2b 0x00 0x2e
+
+# CHECK: kmo %r14, %r2
+0xb9 0x2b 0x00 0xe2
+
+# CHECK: kmo %r14, %r10
+0xb9 0x2b 0x00 0xea
+
+# CHECK: kxbr %f0, %f0
+0xb3 0x48 0x00 0x00
+
+# CHECK: kxbr %f0, %f13
+0xb3 0x48 0x00 0x0d
+
+# CHECK: kxbr %f8, %f8
+0xb3 0x48 0x00 0x88
+
+# CHECK: kxbr %f13, %f0
+0xb3 0x48 0x00 0xd0
+
+# CHECK: l %r0, 0
+0x58 0x00 0x00 0x00
+
+# CHECK: l %r0, 4095
+0x58 0x00 0x0f 0xff
+
+# CHECK: l %r0, 0(%r1)
+0x58 0x00 0x10 0x00
+
+# CHECK: l %r0, 0(%r15)
+0x58 0x00 0xf0 0x00
+
+# CHECK: l %r0, 4095(%r1,%r15)
+0x58 0x01 0xff 0xff
+
+# CHECK: l %r0, 4095(%r15,%r1)
+0x58 0x0f 0x1f 0xff
+
+# CHECK: l %r15, 0
+0x58 0xf0 0x00 0x00
+
# CHECK: la %r0, 0
0x41 0x00 0x00 0x00
@@ -4672,8 +5719,8 @@
# CHECK: lan %r15, %r0, 0
0xeb 0xf0 0x00 0x00 0x00 0xf4
-# CHECK: csy %r0, %r0, -524288
-0xeb 0x00 0x00 0x00 0x80 0x14
+# CHECK: lang %r0, %r0, -524288
+0xeb 0x00 0x00 0x00 0x80 0xe4
# CHECK: lang %r0, %r0, -1
0xeb 0x00 0x0f 0xff 0xff 0xe4
@@ -4771,6 +5818,36 @@
# CHECK: laog %r15, %r0, 0
0xeb 0xf0 0x00 0x00 0x00 0xe6
+# CHECK: lat %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x9f
+
+# CHECK: lat %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x9f
+
+# CHECK: lat %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x9f
+
+# CHECK: lat %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x9f
+
+# CHECK: lat %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x9f
+
+# CHECK: lat %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x9f
+
+# CHECK: lat %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x9f
+
+# CHECK: lat %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x9f
+
+# CHECK: lat %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x9f
+
+# CHECK: lat %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x9f
+
# CHECK: lax %r0, %r0, -524288
0xeb 0x00 0x00 0x00 0x80 0xf7
@@ -4867,15 +5944,6 @@
# CHECK: lay %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x71
-# CHECK: lbr %r0, %r15
-0xb9 0x26 0x00 0x0f
-
-# CHECK: lbr %r7, %r8
-0xb9 0x26 0x00 0x78
-
-# CHECK: lbr %r15, %r0
-0xb9 0x26 0x00 0xf0
-
# CHECK: lb %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x76
@@ -4936,6 +6004,15 @@
# CHECK: lbh %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0xc0
+# CHECK: lbr %r0, %r15
+0xb9 0x26 0x00 0x0f
+
+# CHECK: lbr %r7, %r8
+0xb9 0x26 0x00 0x78
+
+# CHECK: lbr %r15, %r0
+0xb9 0x26 0x00 0xf0
+
# CHECK: lcdbr %f0, %f9
0xb3 0x13 0x00 0x09
@@ -5008,14 +6085,26 @@
# CHECK: lcxbr %f13, %f9
0xb3 0x43 0x00 0xd9
-# CHECK: ldebr %f0, %f15
-0xb3 0x04 0x00 0x0f
+# CHECK: ld %f0, 0
+0x68 0x00 0x00 0x00
-# CHECK: ldebr %f7, %f8
-0xb3 0x04 0x00 0x78
+# CHECK: ld %f0, 4095
+0x68 0x00 0x0f 0xff
-# CHECK: ldebr %f15, %f0
-0xb3 0x04 0x00 0xf0
+# CHECK: ld %f0, 0(%r1)
+0x68 0x00 0x10 0x00
+
+# CHECK: ld %f0, 0(%r15)
+0x68 0x00 0xf0 0x00
+
+# CHECK: ld %f0, 4095(%r1,%r15)
+0x68 0x01 0xff 0xff
+
+# CHECK: ld %f0, 4095(%r15,%r1)
+0x68 0x0f 0x1f 0xff
+
+# CHECK: ld %f15, 0
+0x68 0xf0 0x00 0x00
# CHECK: ldeb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x04
@@ -5038,6 +6127,15 @@
# CHECK: ldeb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x04
+# CHECK: ldebr %f0, %f15
+0xb3 0x04 0x00 0x0f
+
+# CHECK: ldebr %f7, %f8
+0xb3 0x04 0x00 0x78
+
+# CHECK: ldebr %f15, %f0
+0xb3 0x04 0x00 0xf0
+
# CHECK: ldgr %f0, %r0
0xb3 0xc1 0x00 0x00
@@ -5065,27 +6163,6 @@
# CHECK: ldr %f15, %f9
0x28 0xf9
-# CHECK: ld %f0, 0
-0x68 0x00 0x00 0x00
-
-# CHECK: ld %f0, 4095
-0x68 0x00 0x0f 0xff
-
-# CHECK: ld %f0, 0(%r1)
-0x68 0x00 0x10 0x00
-
-# CHECK: ld %f0, 0(%r15)
-0x68 0x00 0xf0 0x00
-
-# CHECK: ld %f0, 4095(%r1,%r15)
-0x68 0x01 0xff 0xff
-
-# CHECK: ld %f0, 4095(%r15,%r1)
-0x68 0x0f 0x1f 0xff
-
-# CHECK: ld %f15, 0
-0x68 0xf0 0x00 0x00
-
# CHECK: ldxbr %f0, %f0
0xb3 0x45 0x00 0x00
@@ -5101,22 +6178,22 @@
# CHECK: ldxbr %f13, %f13
0xb3 0x45 0x00 0xdd
-# CHECK: ldxbra %f0, 0, %f0, 1
+# CHECK: ldxbra %f0, 0, %f0, 1
0xb3 0x45 0x01 0x00
-# CHECK: ldxbra %f0, 0, %f0, 15
+# CHECK: ldxbra %f0, 0, %f0, 15
0xb3 0x45 0x0f 0x00
-# CHECK: ldxbra %f0, 0, %f13, 1
+# CHECK: ldxbra %f0, 0, %f13, 1
0xb3 0x45 0x01 0x0d
-# CHECK: ldxbra %f0, 15, %f0, 1
+# CHECK: ldxbra %f0, 15, %f0, 1
0xb3 0x45 0xf1 0x00
-# CHECK: ldxbra %f4, 5, %f8, 9
+# CHECK: ldxbra %f4, 5, %f8, 9
0xb3 0x45 0x59 0x48
-# CHECK: ldxbra %f13, 0, %f0, 1
+# CHECK: ldxbra %f13, 0, %f0, 1
0xb3 0x45 0x01 0xd0
# CHECK: ldy %f0, -524288
@@ -5149,6 +6226,27 @@
# CHECK: ldy %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x65
+# CHECK: le %f0, 0
+0x78 0x00 0x00 0x00
+
+# CHECK: le %f0, 4095
+0x78 0x00 0x0f 0xff
+
+# CHECK: le %f0, 0(%r1)
+0x78 0x00 0x10 0x00
+
+# CHECK: le %f0, 0(%r15)
+0x78 0x00 0xf0 0x00
+
+# CHECK: le %f0, 4095(%r1,%r15)
+0x78 0x01 0xff 0xff
+
+# CHECK: le %f0, 4095(%r15,%r1)
+0x78 0x0f 0x1f 0xff
+
+# CHECK: le %f15, 0
+0x78 0xf0 0x00 0x00
+
# CHECK: ledbr %f0, %f0
0xb3 0x44 0x00 0x00
@@ -5164,22 +6262,22 @@
# CHECK: ledbr %f15, %f15
0xb3 0x44 0x00 0xff
-# CHECK: ledbra %f0, 0, %f0, 1
+# CHECK: ledbra %f0, 0, %f0, 1
0xb3 0x44 0x01 0x00
-# CHECK: ledbra %f0, 0, %f0, 15
+# CHECK: ledbra %f0, 0, %f0, 15
0xb3 0x44 0x0f 0x00
-# CHECK: ledbra %f0, 0, %f15, 1
+# CHECK: ledbra %f0, 0, %f15, 1
0xb3 0x44 0x01 0x0f
-# CHECK: ledbra %f0, 15, %f0, 1
+# CHECK: ledbra %f0, 15, %f0, 1
0xb3 0x44 0xf1 0x00
-# CHECK: ledbra %f4, 5, %f6, 7
+# CHECK: ledbra %f4, 5, %f6, 7
0xb3 0x44 0x57 0x46
-# CHECK: ledbra %f15, 0, %f0, 1
+# CHECK: ledbra %f15, 0, %f0, 1
0xb3 0x44 0x01 0xf0
# CHECK: ler %f0, %f9
@@ -5194,27 +6292,6 @@
# CHECK: ler %f15, %f9
0x38 0xf9
-# CHECK: le %f0, 0
-0x78 0x00 0x00 0x00
-
-# CHECK: le %f0, 4095
-0x78 0x00 0x0f 0xff
-
-# CHECK: le %f0, 0(%r1)
-0x78 0x00 0x10 0x00
-
-# CHECK: le %f0, 0(%r15)
-0x78 0x00 0xf0 0x00
-
-# CHECK: le %f0, 4095(%r1,%r15)
-0x78 0x01 0xff 0xff
-
-# CHECK: le %f0, 4095(%r15,%r1)
-0x78 0x0f 0x1f 0xff
-
-# CHECK: le %f15, 0
-0x78 0xf0 0x00 0x00
-
# CHECK: lexbr %f0, %f0
0xb3 0x46 0x00 0x00
@@ -5230,22 +6307,22 @@
# CHECK: lexbr %f13, %f13
0xb3 0x46 0x00 0xdd
-# CHECK: lexbra %f0, 0, %f0, 1
+# CHECK: lexbra %f0, 0, %f0, 1
0xb3 0x46 0x01 0x00
-# CHECK: lexbra %f0, 0, %f0, 15
+# CHECK: lexbra %f0, 0, %f0, 15
0xb3 0x46 0x0f 0x00
-# CHECK: lexbra %f0, 0, %f13, 1
+# CHECK: lexbra %f0, 0, %f13, 1
0xb3 0x46 0x01 0x0d
-# CHECK: lexbra %f0, 15, %f0, 1
+# CHECK: lexbra %f0, 15, %f0, 1
0xb3 0x46 0xf1 0x00
-# CHECK: lexbra %f4, 5, %f8, 9
+# CHECK: lexbra %f4, 5, %f8, 9
0xb3 0x46 0x59 0x48
-# CHECK: lexbra %f13, 0, %f0, 1
+# CHECK: lexbra %f13, 0, %f0, 1
0xb3 0x46 0x01 0xd0
# CHECK: ley %f0, -524288
@@ -5278,6 +6355,24 @@
# CHECK: ley %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x64
+# CHECK: lfas 0
+0xb2 0xbd 0x00 0x00
+
+# CHECK: lfas 0(%r1)
+0xb2 0xbd 0x10 0x00
+
+# CHECK: lfas 0(%r15)
+0xb2 0xbd 0xf0 0x00
+
+# CHECK: lfas 4095
+0xb2 0xbd 0x0f 0xff
+
+# CHECK: lfas 4095(%r1)
+0xb2 0xbd 0x1f 0xff
+
+# CHECK: lfas 4095(%r15)
+0xb2 0xbd 0xff 0xff
+
# CHECK: lfh %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0xca
@@ -5338,24 +6433,6 @@
# CHECK: lfhat %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0xc8
-# CHECK: lfas 0
-0xb2 0xbd 0x00 0x00
-
-# CHECK: lfas 0(%r1)
-0xb2 0xbd 0x10 0x00
-
-# CHECK: lfas 0(%r15)
-0xb2 0xbd 0xf0 0x00
-
-# CHECK: lfas 4095
-0xb2 0xbd 0x0f 0xff
-
-# CHECK: lfas 4095(%r1)
-0xb2 0xbd 0x1f 0xff
-
-# CHECK: lfas 4095(%r15)
-0xb2 0xbd 0xff 0xff
-
# CHECK: lfpc 0
0xb2 0x9d 0x00 0x00
@@ -5374,14 +6451,65 @@
# CHECK: lfpc 4095(%r15)
0xb2 0x9d 0xff 0xff
-# CHECK: lgbr %r0, %r15
-0xb9 0x06 0x00 0x0f
+# CHECK: lg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x04
-# CHECK: lgbr %r7, %r8
-0xb9 0x06 0x00 0x78
+# CHECK: lg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x04
-# CHECK: lgbr %r15, %r0
-0xb9 0x06 0x00 0xf0
+# CHECK: lg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x04
+
+# CHECK: lg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x04
+
+# CHECK: lg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x04
+
+# CHECK: lg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x04
+
+# CHECK: lg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x04
+
+# CHECK: lg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x04
+
+# CHECK: lg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x04
+
+# CHECK: lg %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x04
+
+# CHECK: lgat %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x85
+
+# CHECK: lgat %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x85
+
+# CHECK: lgat %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x85
+
+# CHECK: lgat %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x85
+
+# CHECK: lgat %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x85
+
+# CHECK: lgat %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x85
+
+# CHECK: lgat %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x85
+
+# CHECK: lgat %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x85
+
+# CHECK: lgat %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x85
+
+# CHECK: lgat %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x85
# CHECK: lgb %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x77
@@ -5413,6 +6541,15 @@
# CHECK: lgb %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x77
+# CHECK: lgbr %r0, %r15
+0xb9 0x06 0x00 0x0f
+
+# CHECK: lgbr %r7, %r8
+0xb9 0x06 0x00 0x78
+
+# CHECK: lgbr %r15, %r0
+0xb9 0x06 0x00 0xf0
+
# CHECK: lgdr %r0, %f0
0xb3 0xcd 0x00 0x00
@@ -5428,33 +6565,6 @@
# CHECK: lgdr %r15, %f15
0xb3 0xcd 0x00 0xff
-# CHECK: lgfi %r0, -2147483648
-0xc0 0x01 0x80 0x00 0x00 0x00
-
-# CHECK: lgfi %r0, -1
-0xc0 0x01 0xff 0xff 0xff 0xff
-
-# CHECK: lgfi %r0, 0
-0xc0 0x01 0x00 0x00 0x00 0x00
-
-# CHECK: lgfi %r0, 1
-0xc0 0x01 0x00 0x00 0x00 0x01
-
-# CHECK: lgfi %r0, 2147483647
-0xc0 0x01 0x7f 0xff 0xff 0xff
-
-# CHECK: lgfi %r15, 0
-0xc0 0xf1 0x00 0x00 0x00 0x00
-
-# CHECK: lgfr %r0, %r15
-0xb9 0x14 0x00 0x0f
-
-# CHECK: lgfr %r7, %r8
-0xb9 0x14 0x00 0x78
-
-# CHECK: lgfr %r15, %r0
-0xb9 0x14 0x00 0xf0
-
# CHECK: lgf %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x14
@@ -5485,32 +6595,32 @@
# CHECK: lgf %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x14
-# CHECK: lghi %r0, -32768
-0xa7 0x09 0x80 0x00
+# CHECK: lgfi %r0, -2147483648
+0xc0 0x01 0x80 0x00 0x00 0x00
-# CHECK: lghi %r0, -1
-0xa7 0x09 0xff 0xff
+# CHECK: lgfi %r0, -1
+0xc0 0x01 0xff 0xff 0xff 0xff
-# CHECK: lghi %r0, 0
-0xa7 0x09 0x00 0x00
+# CHECK: lgfi %r0, 0
+0xc0 0x01 0x00 0x00 0x00 0x00
-# CHECK: lghi %r0, 1
-0xa7 0x09 0x00 0x01
+# CHECK: lgfi %r0, 1
+0xc0 0x01 0x00 0x00 0x00 0x01
-# CHECK: lghi %r0, 32767
-0xa7 0x09 0x7f 0xff
+# CHECK: lgfi %r0, 2147483647
+0xc0 0x01 0x7f 0xff 0xff 0xff
-# CHECK: lghi %r15, 0
-0xa7 0xf9 0x00 0x00
+# CHECK: lgfi %r15, 0
+0xc0 0xf1 0x00 0x00 0x00 0x00
-# CHECK: lghr %r0, %r15
-0xb9 0x07 0x00 0x0f
+# CHECK: lgfr %r0, %r15
+0xb9 0x14 0x00 0x0f
-# CHECK: lghr %r7, %r8
-0xb9 0x07 0x00 0x78
+# CHECK: lgfr %r7, %r8
+0xb9 0x14 0x00 0x78
-# CHECK: lghr %r15, %r0
-0xb9 0x07 0x00 0xf0
+# CHECK: lgfr %r15, %r0
+0xb9 0x14 0x00 0xf0
# CHECK: lgh %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x15
@@ -5542,95 +6652,65 @@
# CHECK: lgh %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x15
-# CHECK: lgr %r0, %r9
-0xb9 0x04 0x00 0x09
-
-# CHECK: lgr %r0, %r15
-0xb9 0x04 0x00 0x0f
-
-# CHECK: lgr %r15, %r0
-0xb9 0x04 0x00 0xf0
-
-# CHECK: lgr %r15, %r9
-0xb9 0x04 0x00 0xf9
-
-# CHECK: lg %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x04
-
-# CHECK: lg %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x04
-
-# CHECK: lg %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x04
-
-# CHECK: lg %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x04
-
-# CHECK: lg %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x04
-
-# CHECK: lg %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x04
-
-# CHECK: lg %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x04
+# CHECK: lghi %r0, -32768
+0xa7 0x09 0x80 0x00
-# CHECK: lg %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x04
+# CHECK: lghi %r0, -1
+0xa7 0x09 0xff 0xff
-# CHECK: lg %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x04
+# CHECK: lghi %r0, 0
+0xa7 0x09 0x00 0x00
-# CHECK: lg %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x04
+# CHECK: lghi %r0, 1
+0xa7 0x09 0x00 0x01
-# CHECK: lgat %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x85
+# CHECK: lghi %r0, 32767
+0xa7 0x09 0x7f 0xff
-# CHECK: lgat %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x85
+# CHECK: lghi %r15, 0
+0xa7 0xf9 0x00 0x00
-# CHECK: lgat %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x85
+# CHECK: lghr %r0, %r15
+0xb9 0x07 0x00 0x0f
-# CHECK: lgat %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x85
+# CHECK: lghr %r7, %r8
+0xb9 0x07 0x00 0x78
-# CHECK: lgat %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x85
+# CHECK: lghr %r15, %r0
+0xb9 0x07 0x00 0xf0
-# CHECK: lgat %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x85
+# CHECK: lgr %r0, %r9
+0xb9 0x04 0x00 0x09
-# CHECK: lgat %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x85
+# CHECK: lgr %r0, %r15
+0xb9 0x04 0x00 0x0f
-# CHECK: lgat %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x85
+# CHECK: lgr %r15, %r0
+0xb9 0x04 0x00 0xf0
-# CHECK: lgat %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x85
+# CHECK: lgr %r15, %r9
+0xb9 0x04 0x00 0xf9
-# CHECK: lgat %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x85
+# CHECK: lh %r0, 0
+0x48 0x00 0x00 0x00
-# CHECK: lhi %r0, -32768
-0xa7 0x08 0x80 0x00
+# CHECK: lh %r0, 4095
+0x48 0x00 0x0f 0xff
-# CHECK: lhi %r0, -1
-0xa7 0x08 0xff 0xff
+# CHECK: lh %r0, 0(%r1)
+0x48 0x00 0x10 0x00
-# CHECK: lhi %r0, 0
-0xa7 0x08 0x00 0x00
+# CHECK: lh %r0, 0(%r15)
+0x48 0x00 0xf0 0x00
-# CHECK: lhi %r0, 1
-0xa7 0x08 0x00 0x01
+# CHECK: lh %r0, 4095(%r1,%r15)
+0x48 0x01 0xff 0xff
-# CHECK: lhi %r0, 32767
-0xa7 0x08 0x7f 0xff
+# CHECK: lh %r0, 4095(%r15,%r1)
+0x48 0x0f 0x1f 0xff
-# CHECK: lhi %r15, 0
-0xa7 0xf8 0x00 0x00
+# CHECK: lh %r15, 0
+0x48 0xf0 0x00 0x00
# CHECK: lhh %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0xc4
@@ -5662,35 +6742,32 @@
# CHECK: lhh %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0xc4
-# CHECK: lhr %r0, %r15
-0xb9 0x27 0x00 0x0f
-
-# CHECK: lhr %r7, %r8
-0xb9 0x27 0x00 0x78
+# CHECK: lhi %r0, -32768
+0xa7 0x08 0x80 0x00
-# CHECK: lhr %r15, %r0
-0xb9 0x27 0x00 0xf0
+# CHECK: lhi %r0, -1
+0xa7 0x08 0xff 0xff
-# CHECK: lh %r0, 0
-0x48 0x00 0x00 0x00
+# CHECK: lhi %r0, 0
+0xa7 0x08 0x00 0x00
-# CHECK: lh %r0, 4095
-0x48 0x00 0x0f 0xff
+# CHECK: lhi %r0, 1
+0xa7 0x08 0x00 0x01
-# CHECK: lh %r0, 0(%r1)
-0x48 0x00 0x10 0x00
+# CHECK: lhi %r0, 32767
+0xa7 0x08 0x7f 0xff
-# CHECK: lh %r0, 0(%r15)
-0x48 0x00 0xf0 0x00
+# CHECK: lhi %r15, 0
+0xa7 0xf8 0x00 0x00
-# CHECK: lh %r0, 4095(%r1,%r15)
-0x48 0x01 0xff 0xff
+# CHECK: lhr %r0, %r15
+0xb9 0x27 0x00 0x0f
-# CHECK: lh %r0, 4095(%r15,%r1)
-0x48 0x0f 0x1f 0xff
+# CHECK: lhr %r7, %r8
+0xb9 0x27 0x00 0x78
-# CHECK: lh %r15, 0
-0x48 0xf0 0x00 0x00
+# CHECK: lhr %r15, %r0
+0xb9 0x27 0x00 0xf0
# CHECK: lhy %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x78
@@ -5722,15 +6799,6 @@
# CHECK: lhy %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x78
-# CHECK: llcr %r0, %r15
-0xb9 0x94 0x00 0x0f
-
-# CHECK: llcr %r7, %r8
-0xb9 0x94 0x00 0x78
-
-# CHECK: llcr %r15, %r0
-0xb9 0x94 0x00 0xf0
-
# CHECK: llc %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x94
@@ -5791,14 +6859,14 @@
# CHECK: llch %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0xc2
-# CHECK: llgcr %r0, %r15
-0xb9 0x84 0x00 0x0f
+# CHECK: llcr %r0, %r15
+0xb9 0x94 0x00 0x0f
-# CHECK: llgcr %r7, %r8
-0xb9 0x84 0x00 0x78
+# CHECK: llcr %r7, %r8
+0xb9 0x94 0x00 0x78
-# CHECK: llgcr %r15, %r0
-0xb9 0x84 0x00 0xf0
+# CHECK: llcr %r15, %r0
+0xb9 0x94 0x00 0xf0
# CHECK: llgc %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x90
@@ -5830,14 +6898,14 @@
# CHECK: llgc %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x90
-# CHECK: llgfr %r0, %r15
-0xb9 0x16 0x00 0x0f
+# CHECK: llgcr %r0, %r15
+0xb9 0x84 0x00 0x0f
-# CHECK: llgfr %r7, %r8
-0xb9 0x16 0x00 0x78
+# CHECK: llgcr %r7, %r8
+0xb9 0x84 0x00 0x78
-# CHECK: llgfr %r15, %r0
-0xb9 0x16 0x00 0xf0
+# CHECK: llgcr %r15, %r0
+0xb9 0x84 0x00 0xf0
# CHECK: llgf %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x16
@@ -5899,14 +6967,53 @@
# CHECK: llgfat %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x9d
-# CHECK: llgtr %r0, %r15
-0xb9 0x17 0x00 0x0f
+# CHECK: llgfr %r0, %r15
+0xb9 0x16 0x00 0x0f
-# CHECK: llgtr %r7, %r8
-0xb9 0x17 0x00 0x78
+# CHECK: llgfr %r7, %r8
+0xb9 0x16 0x00 0x78
-# CHECK: llgtr %r15, %r0
-0xb9 0x17 0x00 0xf0
+# CHECK: llgfr %r15, %r0
+0xb9 0x16 0x00 0xf0
+
+# CHECK: llgh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x91
+
+# CHECK: llgh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x91
+
+# CHECK: llgh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x91
+
+# CHECK: llgh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x91
+
+# CHECK: llgh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x91
+
+# CHECK: llgh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x91
+
+# CHECK: llgh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x91
+
+# CHECK: llgh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x91
+
+# CHECK: llgh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x91
+
+# CHECK: llgh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x91
+
+# CHECK: llghr %r0, %r15
+0xb9 0x85 0x00 0x0f
+
+# CHECK: llghr %r7, %r8
+0xb9 0x85 0x00 0x78
+
+# CHECK: llghr %r15, %r0
+0xb9 0x85 0x00 0xf0
# CHECK: llgt %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x17
@@ -5968,53 +7075,14 @@
# CHECK: llgtat %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x9c
-# CHECK: llghr %r0, %r15
-0xb9 0x85 0x00 0x0f
-
-# CHECK: llghr %r7, %r8
-0xb9 0x85 0x00 0x78
-
-# CHECK: llghr %r15, %r0
-0xb9 0x85 0x00 0xf0
-
-# CHECK: llgh %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x91
-
-# CHECK: llgh %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x91
-
-# CHECK: llgh %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x91
-
-# CHECK: llgh %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x91
-
-# CHECK: llgh %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x91
-
-# CHECK: llgh %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x91
-
-# CHECK: llgh %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x91
-
-# CHECK: llgh %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x91
-
-# CHECK: llgh %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x91
-
-# CHECK: llgh %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x91
-
-# CHECK: llhr %r0, %r15
-0xb9 0x95 0x00 0x0f
+# CHECK: llgtr %r0, %r15
+0xb9 0x17 0x00 0x0f
-# CHECK: llhr %r7, %r8
-0xb9 0x95 0x00 0x78
+# CHECK: llgtr %r7, %r8
+0xb9 0x17 0x00 0x78
-# CHECK: llhr %r15, %r0
-0xb9 0x95 0x00 0xf0
+# CHECK: llgtr %r15, %r0
+0xb9 0x17 0x00 0xf0
# CHECK: llh %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x95
@@ -6076,6 +7144,15 @@
# CHECK: llhh %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0xc6
+# CHECK: llhr %r0, %r15
+0xb9 0x95 0x00 0x0f
+
+# CHECK: llhr %r7, %r8
+0xb9 0x95 0x00 0x78
+
+# CHECK: llhr %r15, %r0
+0xb9 0x95 0x00 0xf0
+
# CHECK: llihf %r0, 0
0xc0 0x0e 0x00 0x00 0x00 0x00
@@ -6172,6 +7249,27 @@
# CHECK: lm %r0, %r0, 4095(%r15)
0x98 0x00 0xff 0xff
+# CHECK: lmd %r0, %r0, 0, 0
+0xef 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: lmd %r2, %r4, 0, 4095
+0xef 0x24 0x00 0x00 0x0f 0xff
+
+# CHECK: lmd %r2, %r4, 0, 0(%r1)
+0xef 0x24 0x00 0x00 0x10 0x00
+
+# CHECK: lmd %r2, %r4, 0, 0(%r15)
+0xef 0x24 0x00 0x00 0xf0 0x00
+
+# CHECK: lmd %r2, %r4, 0(%r1), 4095(%r15)
+0xef 0x24 0x10 0x00 0xff 0xff
+
+# CHECK: lmd %r2, %r4, 0(%r1), 0(%r15)
+0xef 0x24 0x10 0x00 0xf0 0x00
+
+# CHECK: lmd %r2, %r4, 4095(%r1), 0(%r15)
+0xef 0x24 0x1f 0xff 0xf0 0x00
+
# CHECK: lmg %r0, %r0, 0
0xeb 0x00 0x00 0x00 0x00 0x04
@@ -6574,6 +7672,18 @@
# CHECK: lpd %r2, 4095(%r1), 0(%r15)
0xc8 0x24 0x1f 0xff 0xf0 0x00
+# CHECK: lpdbr %f0, %f9
+0xb3 0x10 0x00 0x09
+
+# CHECK: lpdbr %f0, %f15
+0xb3 0x10 0x00 0x0f
+
+# CHECK: lpdbr %f15, %f0
+0xb3 0x10 0x00 0xf0
+
+# CHECK: lpdbr %f15, %f9
+0xb3 0x10 0x00 0xf9
+
# CHECK: lpdg %r0, 0, 0
0xc8 0x05 0x00 0x00 0x00 0x00
@@ -6595,18 +7705,6 @@
# CHECK: lpdg %r2, 4095(%r1), 0(%r15)
0xc8 0x25 0x1f 0xff 0xf0 0x00
-# CHECK: lpdbr %f0, %f9
-0xb3 0x10 0x00 0x09
-
-# CHECK: lpdbr %f0, %f15
-0xb3 0x10 0x00 0x0f
-
-# CHECK: lpdbr %f15, %f0
-0xb3 0x10 0x00 0xf0
-
-# CHECK: lpdbr %f15, %f9
-0xb3 0x10 0x00 0xf9
-
# CHECK: lpebr %f0, %f9
0xb3 0x00 0x00 0x09
@@ -6643,18 +7741,6 @@
# CHECK: lpgr %r7, %r8
0xb9 0x00 0x00 0x78
-# CHECK: lpr %r0, %r0
-0x10 0x00
-
-# CHECK: lpr %r0, %r15
-0x10 0x0f
-
-# CHECK: lpr %r15, %r0
-0x10 0xf0
-
-# CHECK: lpr %r7, %r8
-0x10 0x78
-
# CHECK: lpq %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x8f
@@ -6685,6 +7771,18 @@
# CHECK: lpq %r14, 0
0xe3 0xe0 0x00 0x00 0x00 0x8f
+# CHECK: lpr %r0, %r0
+0x10 0x00
+
+# CHECK: lpr %r0, %r15
+0x10 0x0f
+
+# CHECK: lpr %r15, %r0
+0x10 0xf0
+
+# CHECK: lpr %r7, %r8
+0x10 0x78
+
# CHECK: lpxbr %f0, %f8
0xb3 0x40 0x00 0x08
@@ -6709,20 +7807,35 @@
# CHECK: lr %r15, %r9
0x18 0xf9
-# CHECK: lrvgr %r0, %r0
-0xb9 0x0f 0x00 0x00
+# CHECK: lrv %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x1e
-# CHECK: lrvgr %r0, %r15
-0xb9 0x0f 0x00 0x0f
+# CHECK: lrv %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x1e
-# CHECK: lrvgr %r15, %r0
-0xb9 0x0f 0x00 0xf0
+# CHECK: lrv %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x1e
-# CHECK: lrvgr %r7, %r8
-0xb9 0x0f 0x00 0x78
+# CHECK: lrv %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x1e
-# CHECK: lrvgr %r15, %r15
-0xb9 0x0f 0x00 0xff
+# CHECK: lrv %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x1e
+
+# CHECK: lrv %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x1e
+
+# CHECK: lrv %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x1e
+
+# CHECK: lrv %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x1e
+
+# CHECK: lrv %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x1e
+
+# CHECK: lrv %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x1e
# CHECK: lrvg %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x0f
@@ -6754,20 +7867,20 @@
# CHECK: lrvg %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x0f
-# CHECK: lrvr %r0, %r0
-0xb9 0x1f 0x00 0x00
+# CHECK: lrvgr %r0, %r0
+0xb9 0x0f 0x00 0x00
-# CHECK: lrvr %r0, %r15
-0xb9 0x1f 0x00 0x0f
+# CHECK: lrvgr %r0, %r15
+0xb9 0x0f 0x00 0x0f
-# CHECK: lrvr %r15, %r0
-0xb9 0x1f 0x00 0xf0
+# CHECK: lrvgr %r15, %r0
+0xb9 0x0f 0x00 0xf0
-# CHECK: lrvr %r7, %r8
-0xb9 0x1f 0x00 0x78
+# CHECK: lrvgr %r7, %r8
+0xb9 0x0f 0x00 0x78
-# CHECK: lrvr %r15, %r15
-0xb9 0x1f 0x00 0xff
+# CHECK: lrvgr %r15, %r15
+0xb9 0x0f 0x00 0xff
# CHECK: lrvh %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x1f
@@ -6799,86 +7912,20 @@
# CHECK: lrvh %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x1f
-# CHECK: lrv %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x1e
-
-# CHECK: lrv %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x1e
-
-# CHECK: lrv %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x1e
-
-# CHECK: lrv %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x1e
-
-# CHECK: lrv %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x1e
-
-# CHECK: lrv %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x1e
-
-# CHECK: lrv %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x1e
-
-# CHECK: lrv %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x1e
-
-# CHECK: lrv %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x1e
-
-# CHECK: lrv %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x1e
-
-# CHECK: l %r0, 0
-0x58 0x00 0x00 0x00
-
-# CHECK: l %r0, 4095
-0x58 0x00 0x0f 0xff
-
-# CHECK: l %r0, 0(%r1)
-0x58 0x00 0x10 0x00
-
-# CHECK: l %r0, 0(%r15)
-0x58 0x00 0xf0 0x00
-
-# CHECK: l %r0, 4095(%r1,%r15)
-0x58 0x01 0xff 0xff
-
-# CHECK: l %r0, 4095(%r15,%r1)
-0x58 0x0f 0x1f 0xff
-
-# CHECK: l %r15, 0
-0x58 0xf0 0x00 0x00
-
-# CHECK: lat %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x9f
-
-# CHECK: lat %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x9f
-
-# CHECK: lat %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x9f
-
-# CHECK: lat %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x9f
-
-# CHECK: lat %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x9f
-
-# CHECK: lat %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x9f
+# CHECK: lrvr %r0, %r0
+0xb9 0x1f 0x00 0x00
-# CHECK: lat %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x9f
+# CHECK: lrvr %r0, %r15
+0xb9 0x1f 0x00 0x0f
-# CHECK: lat %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x9f
+# CHECK: lrvr %r15, %r0
+0xb9 0x1f 0x00 0xf0
-# CHECK: lat %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x9f
+# CHECK: lrvr %r7, %r8
+0xb9 0x1f 0x00 0x78
-# CHECK: lat %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x9f
+# CHECK: lrvr %r15, %r15
+0xb9 0x1f 0x00 0xff
# CHECK: lt %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x12
@@ -7111,23 +8158,26 @@
# CHECK: lzxr %f13
0xb3 0x76 0x00 0xd0
-# CHECK: madbr %f0, %f0, %f0
-0xb3 0x1e 0x00 0x00
+# CHECK: m %r0, 0
+0x5c 0x00 0x00 0x00
-# CHECK: madbr %f0, %f0, %f15
-0xb3 0x1e 0x00 0x0f
+# CHECK: m %r0, 4095
+0x5c 0x00 0x0f 0xff
-# CHECK: madbr %f0, %f15, %f0
-0xb3 0x1e 0x00 0xf0
+# CHECK: m %r0, 0(%r1)
+0x5c 0x00 0x10 0x00
-# CHECK: madbr %f15, %f0, %f0
-0xb3 0x1e 0xf0 0x00
+# CHECK: m %r0, 0(%r15)
+0x5c 0x00 0xf0 0x00
-# CHECK: madbr %f7, %f8, %f9
-0xb3 0x1e 0x70 0x89
+# CHECK: m %r0, 4095(%r1,%r15)
+0x5c 0x01 0xff 0xff
-# CHECK: madbr %f15, %f15, %f15
-0xb3 0x1e 0xf0 0xff
+# CHECK: m %r0, 4095(%r15,%r1)
+0x5c 0x0f 0x1f 0xff
+
+# CHECK: m %r14, 0
+0x5c 0xe0 0x00 0x00
# CHECK: madb %f0, %f0, 0
0xed 0x00 0x00 0x00 0x00 0x1e
@@ -7156,23 +8206,23 @@
# CHECK: madb %f15, %f15, 0
0xed 0xf0 0x00 0x00 0xf0 0x1e
-# CHECK: maebr %f0, %f0, %f0
-0xb3 0x0e 0x00 0x00
+# CHECK: madbr %f0, %f0, %f0
+0xb3 0x1e 0x00 0x00
-# CHECK: maebr %f0, %f0, %f15
-0xb3 0x0e 0x00 0x0f
+# CHECK: madbr %f0, %f0, %f15
+0xb3 0x1e 0x00 0x0f
-# CHECK: maebr %f0, %f15, %f0
-0xb3 0x0e 0x00 0xf0
+# CHECK: madbr %f0, %f15, %f0
+0xb3 0x1e 0x00 0xf0
-# CHECK: maebr %f15, %f0, %f0
-0xb3 0x0e 0xf0 0x00
+# CHECK: madbr %f15, %f0, %f0
+0xb3 0x1e 0xf0 0x00
-# CHECK: maebr %f7, %f8, %f9
-0xb3 0x0e 0x70 0x89
+# CHECK: madbr %f7, %f8, %f9
+0xb3 0x1e 0x70 0x89
-# CHECK: maebr %f15, %f15, %f15
-0xb3 0x0e 0xf0 0xff
+# CHECK: madbr %f15, %f15, %f15
+0xb3 0x1e 0xf0 0xff
# CHECK: maeb %f0, %f0, 0
0xed 0x00 0x00 0x00 0x00 0x0e
@@ -7201,17 +8251,44 @@
# CHECK: maeb %f15, %f15, 0
0xed 0xf0 0x00 0x00 0xf0 0x0e
-# CHECK: mdbr %f0, %f0
-0xb3 0x1c 0x00 0x00
+# CHECK: maebr %f0, %f0, %f0
+0xb3 0x0e 0x00 0x00
-# CHECK: mdbr %f0, %f15
-0xb3 0x1c 0x00 0x0f
+# CHECK: maebr %f0, %f0, %f15
+0xb3 0x0e 0x00 0x0f
-# CHECK: mdbr %f7, %f8
-0xb3 0x1c 0x00 0x78
+# CHECK: maebr %f0, %f15, %f0
+0xb3 0x0e 0x00 0xf0
-# CHECK: mdbr %f15, %f0
-0xb3 0x1c 0x00 0xf0
+# CHECK: maebr %f15, %f0, %f0
+0xb3 0x0e 0xf0 0x00
+
+# CHECK: maebr %f7, %f8, %f9
+0xb3 0x0e 0x70 0x89
+
+# CHECK: maebr %f15, %f15, %f15
+0xb3 0x0e 0xf0 0xff
+
+# CHECK: mc 0, 0
+0xaf 0x00 0x00 0x00
+
+# CHECK: mc 4095, 0
+0xaf 0x00 0x0f 0xff
+
+# CHECK: mc 0, 255
+0xaf 0xff 0x00 0x00
+
+# CHECK: mc 0(%r1), 42
+0xaf 0x2a 0x10 0x00
+
+# CHECK: mc 0(%r15), 42
+0xaf 0x2a 0xf0 0x00
+
+# CHECK: mc 4095(%r1), 42
+0xaf 0x2a 0x1f 0xff
+
+# CHECK: mc 4095(%r15), 42
+0xaf 0x2a 0xff 0xff
# CHECK: mdb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x1c
@@ -7234,17 +8311,17 @@
# CHECK: mdb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x1c
-# CHECK: mdebr %f0, %f0
-0xb3 0x0c 0x00 0x00
+# CHECK: mdbr %f0, %f0
+0xb3 0x1c 0x00 0x00
-# CHECK: mdebr %f0, %f15
-0xb3 0x0c 0x00 0x0f
+# CHECK: mdbr %f0, %f15
+0xb3 0x1c 0x00 0x0f
-# CHECK: mdebr %f7, %f8
-0xb3 0x0c 0x00 0x78
+# CHECK: mdbr %f7, %f8
+0xb3 0x1c 0x00 0x78
-# CHECK: mdebr %f15, %f0
-0xb3 0x0c 0x00 0xf0
+# CHECK: mdbr %f15, %f0
+0xb3 0x1c 0x00 0xf0
# CHECK: mdeb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x0c
@@ -7267,17 +8344,17 @@
# CHECK: mdeb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x0c
-# CHECK: meebr %f0, %f0
-0xb3 0x17 0x00 0x00
+# CHECK: mdebr %f0, %f0
+0xb3 0x0c 0x00 0x00
-# CHECK: meebr %f0, %f15
-0xb3 0x17 0x00 0x0f
+# CHECK: mdebr %f0, %f15
+0xb3 0x0c 0x00 0x0f
-# CHECK: meebr %f7, %f8
-0xb3 0x17 0x00 0x78
+# CHECK: mdebr %f7, %f8
+0xb3 0x0c 0x00 0x78
-# CHECK: meebr %f15, %f0
-0xb3 0x17 0x00 0xf0
+# CHECK: mdebr %f15, %f0
+0xb3 0x0c 0x00 0xf0
# CHECK: meeb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x17
@@ -7300,6 +8377,48 @@
# CHECK: meeb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x17
+# CHECK: meebr %f0, %f0
+0xb3 0x17 0x00 0x00
+
+# CHECK: meebr %f0, %f15
+0xb3 0x17 0x00 0x0f
+
+# CHECK: meebr %f7, %f8
+0xb3 0x17 0x00 0x78
+
+# CHECK: meebr %f15, %f0
+0xb3 0x17 0x00 0xf0
+
+# CHECK: mfy %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x5c
+
+# CHECK: mfy %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x5c
+
+# CHECK: mfy %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x5c
+
+# CHECK: mfy %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x5c
+
+# CHECK: mfy %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x5c
+
+# CHECK: mfy %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x5c
+
+# CHECK: mfy %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x5c
+
+# CHECK: mfy %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x5c
+
+# CHECK: mfy %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x5c
+
+# CHECK: mfy %r14, 0
+0xe3 0xe0 0x00 0x00 0x00 0x5c
+
# CHECK: mghi %r0, -32768
0xa7 0x0d 0x80 0x00
@@ -7318,24 +8437,6 @@
# CHECK: mghi %r15, 0
0xa7 0xfd 0x00 0x00
-# CHECK: mhi %r0, -32768
-0xa7 0x0c 0x80 0x00
-
-# CHECK: mhi %r0, -1
-0xa7 0x0c 0xff 0xff
-
-# CHECK: mhi %r0, 0
-0xa7 0x0c 0x00 0x00
-
-# CHECK: mhi %r0, 1
-0xa7 0x0c 0x00 0x01
-
-# CHECK: mhi %r0, 32767
-0xa7 0x0c 0x7f 0xff
-
-# CHECK: mhi %r15, 0
-0xa7 0xfc 0x00 0x00
-
# CHECK: mh %r0, 0
0x4c 0x00 0x00 0x00
@@ -7357,6 +8458,24 @@
# CHECK: mh %r15, 0
0x4c 0xf0 0x00 0x00
+# CHECK: mhi %r0, -32768
+0xa7 0x0c 0x80 0x00
+
+# CHECK: mhi %r0, -1
+0xa7 0x0c 0xff 0xff
+
+# CHECK: mhi %r0, 0
+0xa7 0x0c 0x00 0x00
+
+# CHECK: mhi %r0, 1
+0xa7 0x0c 0x00 0x01
+
+# CHECK: mhi %r0, 32767
+0xa7 0x0c 0x7f 0xff
+
+# CHECK: mhi %r15, 0
+0xa7 0xfc 0x00 0x00
+
# CHECK: mhy %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x7c
@@ -7387,17 +8506,35 @@
# CHECK: mhy %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x7c
-# CHECK: mlgr %r0, %r0
-0xb9 0x86 0x00 0x00
+# CHECK: ml %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x96
-# CHECK: mlgr %r0, %r15
-0xb9 0x86 0x00 0x0f
+# CHECK: ml %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x96
-# CHECK: mlgr %r14, %r0
-0xb9 0x86 0x00 0xe0
+# CHECK: ml %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x96
-# CHECK: mlgr %r6, %r9
-0xb9 0x86 0x00 0x69
+# CHECK: ml %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x96
+
+# CHECK: ml %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x96
+
+# CHECK: ml %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x96
+
+# CHECK: ml %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x96
+
+# CHECK: ml %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x96
+
+# CHECK: ml %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x96
+
+# CHECK: ml %r14, 0
+0xe3 0xe0 0x00 0x00 0x00 0x96
# CHECK: mlg %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x86
@@ -7429,23 +8566,104 @@
# CHECK: mlg %r14, 0
0xe3 0xe0 0x00 0x00 0x00 0x86
-# CHECK: msdbr %f0, %f0, %f0
-0xb3 0x1f 0x00 0x00
+# CHECK: mlgr %r0, %r0
+0xb9 0x86 0x00 0x00
-# CHECK: msdbr %f0, %f0, %f15
-0xb3 0x1f 0x00 0x0f
+# CHECK: mlgr %r0, %r15
+0xb9 0x86 0x00 0x0f
-# CHECK: msdbr %f0, %f15, %f0
-0xb3 0x1f 0x00 0xf0
+# CHECK: mlgr %r14, %r0
+0xb9 0x86 0x00 0xe0
-# CHECK: msdbr %f15, %f0, %f0
-0xb3 0x1f 0xf0 0x00
+# CHECK: mlgr %r6, %r9
+0xb9 0x86 0x00 0x69
-# CHECK: msdbr %f7, %f8, %f9
-0xb3 0x1f 0x70 0x89
+# CHECK: mlr %r0, %r0
+0xb9 0x96 0x00 0x00
-# CHECK: msdbr %f15, %f15, %f15
-0xb3 0x1f 0xf0 0xff
+# CHECK: mlr %r0, %r15
+0xb9 0x96 0x00 0x0f
+
+# CHECK: mlr %r14, %r0
+0xb9 0x96 0x00 0xe0
+
+# CHECK: mlr %r6, %r9
+0xb9 0x96 0x00 0x69
+
+# CHECK: mp 0(1), 0(1)
+0xfc 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: mp 0(1), 0(1,%r1)
+0xfc 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: mp 0(1), 0(1,%r15)
+0xfc 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: mp 0(1), 4095(1)
+0xfc 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: mp 0(1), 4095(1,%r1)
+0xfc 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: mp 0(1), 4095(1,%r15)
+0xfc 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: mp 0(1,%r1), 0(1)
+0xfc 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: mp 0(1,%r15), 0(1)
+0xfc 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: mp 4095(1,%r1), 0(1)
+0xfc 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: mp 4095(1,%r15), 0(1)
+0xfc 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: mp 0(16,%r1), 0(1)
+0xfc 0xf0 0x10 0x00 0x00 0x00
+
+# CHECK: mp 0(16,%r15), 0(1)
+0xfc 0xf0 0xf0 0x00 0x00 0x00
+
+# CHECK: mp 0(1), 0(16,%r1)
+0xfc 0x0f 0x00 0x00 0x10 0x00
+
+# CHECK: mp 0(1), 0(16,%r15)
+0xfc 0x0f 0x00 0x00 0xf0 0x00
+
+# CHECK: mr %r0, %r0
+0x1c 0x00
+
+# CHECK: mr %r0, %r15
+0x1c 0x0f
+
+# CHECK: mr %r14, %r0
+0x1c 0xe0
+
+# CHECK: mr %r6, %r9
+0x1c 0x69
+
+# CHECK: ms %r0, 0
+0x71 0x00 0x00 0x00
+
+# CHECK: ms %r0, 4095
+0x71 0x00 0x0f 0xff
+
+# CHECK: ms %r0, 0(%r1)
+0x71 0x00 0x10 0x00
+
+# CHECK: ms %r0, 0(%r15)
+0x71 0x00 0xf0 0x00
+
+# CHECK: ms %r0, 4095(%r1,%r15)
+0x71 0x01 0xff 0xff
+
+# CHECK: ms %r0, 4095(%r15,%r1)
+0x71 0x0f 0x1f 0xff
+
+# CHECK: ms %r15, 0
+0x71 0xf0 0x00 0x00
# CHECK: msdb %f0, %f0, 0
0xed 0x00 0x00 0x00 0x00 0x1f
@@ -7474,23 +8692,23 @@
# CHECK: msdb %f15, %f15, 0
0xed 0xf0 0x00 0x00 0xf0 0x1f
-# CHECK: msebr %f0, %f0, %f0
-0xb3 0x0f 0x00 0x00
+# CHECK: msdbr %f0, %f0, %f0
+0xb3 0x1f 0x00 0x00
-# CHECK: msebr %f0, %f0, %f15
-0xb3 0x0f 0x00 0x0f
+# CHECK: msdbr %f0, %f0, %f15
+0xb3 0x1f 0x00 0x0f
-# CHECK: msebr %f0, %f15, %f0
-0xb3 0x0f 0x00 0xf0
+# CHECK: msdbr %f0, %f15, %f0
+0xb3 0x1f 0x00 0xf0
-# CHECK: msebr %f15, %f0, %f0
-0xb3 0x0f 0xf0 0x00
+# CHECK: msdbr %f15, %f0, %f0
+0xb3 0x1f 0xf0 0x00
-# CHECK: msebr %f7, %f8, %f9
-0xb3 0x0f 0x70 0x89
+# CHECK: msdbr %f7, %f8, %f9
+0xb3 0x1f 0x70 0x89
-# CHECK: msebr %f15, %f15, %f15
-0xb3 0x0f 0xf0 0xff
+# CHECK: msdbr %f15, %f15, %f15
+0xb3 0x1f 0xf0 0xff
# CHECK: mseb %f0, %f0, 0
0xed 0x00 0x00 0x00 0x00 0x0f
@@ -7519,6 +8737,24 @@
# CHECK: mseb %f15, %f15, 0
0xed 0xf0 0x00 0x00 0xf0 0x0f
+# CHECK: msebr %f0, %f0, %f0
+0xb3 0x0f 0x00 0x00
+
+# CHECK: msebr %f0, %f0, %f15
+0xb3 0x0f 0x00 0x0f
+
+# CHECK: msebr %f0, %f15, %f0
+0xb3 0x0f 0x00 0xf0
+
+# CHECK: msebr %f15, %f0, %f0
+0xb3 0x0f 0xf0 0x00
+
+# CHECK: msebr %f7, %f8, %f9
+0xb3 0x0f 0x70 0x89
+
+# CHECK: msebr %f15, %f15, %f15
+0xb3 0x0f 0xf0 0xff
+
# CHECK: msfi %r0, -2147483648
0xc2 0x01 0x80 0x00 0x00 0x00
@@ -7537,35 +8773,35 @@
# CHECK: msfi %r15, 0
0xc2 0xf1 0x00 0x00 0x00 0x00
-# CHECK: msgfi %r0, -2147483648
-0xc2 0x00 0x80 0x00 0x00 0x00
+# CHECK: msg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x0c
-# CHECK: msgfi %r0, -1
-0xc2 0x00 0xff 0xff 0xff 0xff
+# CHECK: msg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x0c
-# CHECK: msgfi %r0, 0
-0xc2 0x00 0x00 0x00 0x00 0x00
+# CHECK: msg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x0c
-# CHECK: msgfi %r0, 1
-0xc2 0x00 0x00 0x00 0x00 0x01
+# CHECK: msg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x0c
-# CHECK: msgfi %r0, 2147483647
-0xc2 0x00 0x7f 0xff 0xff 0xff
+# CHECK: msg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x0c
-# CHECK: msgfi %r15, 0
-0xc2 0xf0 0x00 0x00 0x00 0x00
+# CHECK: msg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x0c
-# CHECK: msgfr %r0, %r0
-0xb9 0x1c 0x00 0x00
+# CHECK: msg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x0c
-# CHECK: msgfr %r0, %r15
-0xb9 0x1c 0x00 0x0f
+# CHECK: msg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x0c
-# CHECK: msgfr %r15, %r0
-0xb9 0x1c 0x00 0xf0
+# CHECK: msg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x0c
-# CHECK: msgfr %r7, %r8
-0xb9 0x1c 0x00 0x78
+# CHECK: msg %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x0c
# CHECK: msgf %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x1c
@@ -7597,47 +8833,47 @@
# CHECK: msgf %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x1c
-# CHECK: msgr %r0, %r0
-0xb9 0x0c 0x00 0x00
+# CHECK: msgfi %r0, -2147483648
+0xc2 0x00 0x80 0x00 0x00 0x00
-# CHECK: msgr %r0, %r15
-0xb9 0x0c 0x00 0x0f
+# CHECK: msgfi %r0, -1
+0xc2 0x00 0xff 0xff 0xff 0xff
-# CHECK: msgr %r15, %r0
-0xb9 0x0c 0x00 0xf0
+# CHECK: msgfi %r0, 0
+0xc2 0x00 0x00 0x00 0x00 0x00
-# CHECK: msgr %r7, %r8
-0xb9 0x0c 0x00 0x78
+# CHECK: msgfi %r0, 1
+0xc2 0x00 0x00 0x00 0x00 0x01
-# CHECK: msg %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x0c
+# CHECK: msgfi %r0, 2147483647
+0xc2 0x00 0x7f 0xff 0xff 0xff
-# CHECK: msg %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x0c
+# CHECK: msgfi %r15, 0
+0xc2 0xf0 0x00 0x00 0x00 0x00
-# CHECK: msg %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x0c
+# CHECK: msgfr %r0, %r0
+0xb9 0x1c 0x00 0x00
-# CHECK: msg %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x0c
+# CHECK: msgfr %r0, %r15
+0xb9 0x1c 0x00 0x0f
-# CHECK: msg %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x0c
+# CHECK: msgfr %r15, %r0
+0xb9 0x1c 0x00 0xf0
-# CHECK: msg %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x0c
+# CHECK: msgfr %r7, %r8
+0xb9 0x1c 0x00 0x78
-# CHECK: msg %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x0c
+# CHECK: msgr %r0, %r0
+0xb9 0x0c 0x00 0x00
-# CHECK: msg %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x0c
+# CHECK: msgr %r0, %r15
+0xb9 0x0c 0x00 0x0f
-# CHECK: msg %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x0c
+# CHECK: msgr %r15, %r0
+0xb9 0x0c 0x00 0xf0
-# CHECK: msg %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x0c
+# CHECK: msgr %r7, %r8
+0xb9 0x0c 0x00 0x78
# CHECK: msr %r0, %r0
0xb2 0x52 0x00 0x00
@@ -7651,27 +8887,6 @@
# CHECK: msr %r7, %r8
0xb2 0x52 0x00 0x78
-# CHECK: ms %r0, 0
-0x71 0x00 0x00 0x00
-
-# CHECK: ms %r0, 4095
-0x71 0x00 0x0f 0xff
-
-# CHECK: ms %r0, 0(%r1)
-0x71 0x00 0x10 0x00
-
-# CHECK: ms %r0, 0(%r15)
-0x71 0x00 0xf0 0x00
-
-# CHECK: ms %r0, 4095(%r1,%r15)
-0x71 0x01 0xff 0xff
-
-# CHECK: ms %r0, 4095(%r15,%r1)
-0x71 0x0f 0x1f 0xff
-
-# CHECK: ms %r15, 0
-0x71 0xf0 0x00 0x00
-
# CHECK: msy %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x51
@@ -7738,6 +8953,42 @@
# CHECK: mvc 0(256,%r15), 0
0xd2 0xff 0xf0 0x00 0x00 0x00
+# CHECK: mvcin 0(1), 0
+0xe8 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: mvcin 0(1), 0(%r1)
+0xe8 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: mvcin 0(1), 0(%r15)
+0xe8 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: mvcin 0(1), 4095
+0xe8 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: mvcin 0(1), 4095(%r1)
+0xe8 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: mvcin 0(1), 4095(%r15)
+0xe8 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: mvcin 0(1,%r1), 0
+0xe8 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: mvcin 0(1,%r15), 0
+0xe8 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: mvcin 4095(1,%r1), 0
+0xe8 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: mvcin 4095(1,%r15), 0
+0xe8 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: mvcin 0(256,%r1), 0
+0xe8 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: mvcin 0(256,%r15), 0
+0xe8 0xff 0xf0 0x00 0x00 0x00
+
# CHECK: mvck 0(%r0), 0, %r0
0xd9 0x00 0x00 0x00 0x00 0x00
@@ -7759,6 +9010,69 @@
# CHECK: mvck 4095(%r15,%r1), 0(%r15), %r2
0xd9 0xf2 0x1f 0xff 0xf0 0x00
+# CHECK: mvcl %r0, %r8
+0x0e 0x08
+
+# CHECK: mvcl %r0, %r14
+0x0e 0x0e
+
+# CHECK: mvcl %r14, %r0
+0x0e 0xe0
+
+# CHECK: mvcl %r14, %r8
+0x0e 0xe8
+
+# CHECK: mvcle %r0, %r0, 0
+0xa8 0x00 0x00 0x00
+
+# CHECK: mvcle %r0, %r14, 4095
+0xa8 0x0e 0x0f 0xff
+
+# CHECK: mvcle %r0, %r0, 0(%r1)
+0xa8 0x00 0x10 0x00
+
+# CHECK: mvcle %r0, %r0, 0(%r15)
+0xa8 0x00 0xf0 0x00
+
+# CHECK: mvcle %r0, %r14, 4095(%r15)
+0xa8 0x0e 0xff 0xff
+
+# CHECK: mvcle %r0, %r0, 4095(%r1)
+0xa8 0x00 0x1f 0xff
+
+# CHECK: mvcle %r14, %r0, 0
+0xa8 0xe0 0x00 0x00
+
+# CHECK: mvclu %r0, %r0, -524288
+0xeb 0x00 0x00 0x00 0x80 0x8e
+
+# CHECK: mvclu %r0, %r0, -1
+0xeb 0x00 0x0f 0xff 0xff 0x8e
+
+# CHECK: mvclu %r0, %r14, 0
+0xeb 0x0e 0x00 0x00 0x00 0x8e
+
+# CHECK: mvclu %r0, %r14, 1
+0xeb 0x0e 0x00 0x01 0x00 0x8e
+
+# CHECK: mvclu %r0, %r8, 524287
+0xeb 0x08 0x0f 0xff 0x7f 0x8e
+
+# CHECK: mvclu %r0, %r8, 0(%r1)
+0xeb 0x08 0x10 0x00 0x00 0x8e
+
+# CHECK: mvclu %r0, %r4, 0(%r15)
+0xeb 0x04 0xf0 0x00 0x00 0x8e
+
+# CHECK: mvclu %r0, %r4, 524287(%r15)
+0xeb 0x04 0xff 0xff 0x7f 0x8e
+
+# CHECK: mvclu %r0, %r0, 524287(%r1)
+0xeb 0x00 0x1f 0xff 0x7f 0x8e
+
+# CHECK: mvclu %r14, %r0, 0
+0xeb 0xe0 0x00 0x00 0x00 0x8e
+
# CHECK: mvghi 0, 0
0xe5 0x48 0x00 0x00 0x00 0x00
@@ -7909,6 +9223,84 @@
# CHECK: mviy 524287(%r15), 42
0xeb 0x2a 0xff 0xff 0x7f 0x52
+# CHECK: mvn 0(1), 0
+0xd1 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: mvn 0(1), 0(%r1)
+0xd1 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: mvn 0(1), 0(%r15)
+0xd1 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: mvn 0(1), 4095
+0xd1 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: mvn 0(1), 4095(%r1)
+0xd1 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: mvn 0(1), 4095(%r15)
+0xd1 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: mvn 0(1,%r1), 0
+0xd1 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: mvn 0(1,%r15), 0
+0xd1 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: mvn 4095(1,%r1), 0
+0xd1 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: mvn 4095(1,%r15), 0
+0xd1 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: mvn 0(256,%r1), 0
+0xd1 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: mvn 0(256,%r15), 0
+0xd1 0xff 0xf0 0x00 0x00 0x00
+
+# CHECK: mvo 0(1), 0(1)
+0xf1 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: mvo 0(1), 0(1,%r1)
+0xf1 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: mvo 0(1), 0(1,%r15)
+0xf1 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: mvo 0(1), 4095(1)
+0xf1 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: mvo 0(1), 4095(1,%r1)
+0xf1 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: mvo 0(1), 4095(1,%r15)
+0xf1 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: mvo 0(1,%r1), 0(1)
+0xf1 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: mvo 0(1,%r15), 0(1)
+0xf1 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: mvo 4095(1,%r1), 0(1)
+0xf1 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: mvo 4095(1,%r15), 0(1)
+0xf1 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: mvo 0(16,%r1), 0(1)
+0xf1 0xf0 0x10 0x00 0x00 0x00
+
+# CHECK: mvo 0(16,%r15), 0(1)
+0xf1 0xf0 0xf0 0x00 0x00 0x00
+
+# CHECK: mvo 0(1), 0(16,%r1)
+0xf1 0x0f 0x00 0x00 0x10 0x00
+
+# CHECK: mvo 0(1), 0(16,%r15)
+0xf1 0x0f 0x00 0x00 0xf0 0x00
+
# CHECK: mvst %r0, %r0
0xb2 0x55 0x00 0x00
@@ -7921,6 +9313,42 @@
# CHECK: mvst %r7, %r8
0xb2 0x55 0x00 0x78
+# CHECK: mvz 0(1), 0
+0xd3 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: mvz 0(1), 0(%r1)
+0xd3 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: mvz 0(1), 0(%r15)
+0xd3 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: mvz 0(1), 4095
+0xd3 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: mvz 0(1), 4095(%r1)
+0xd3 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: mvz 0(1), 4095(%r15)
+0xd3 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: mvz 0(1,%r1), 0
+0xd3 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: mvz 0(1,%r15), 0
+0xd3 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: mvz 4095(1,%r1), 0
+0xd3 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: mvz 4095(1,%r15), 0
+0xd3 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: mvz 0(256,%r1), 0
+0xd3 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: mvz 0(256,%r15), 0
+0xd3 0xff 0xf0 0x00 0x00 0x00
+
# CHECK: mxbr %f0, %f0
0xb3 0x4c 0x00 0x00
@@ -7933,18 +9361,6 @@
# CHECK: mxbr %f13, %f13
0xb3 0x4c 0x00 0xdd
-# CHECK: mxdbr %f0, %f0
-0xb3 0x07 0x00 0x00
-
-# CHECK: mxdbr %f0, %f15
-0xb3 0x07 0x00 0x0f
-
-# CHECK: mxdbr %f8, %f8
-0xb3 0x07 0x00 0x88
-
-# CHECK: mxdbr %f13, %f0
-0xb3 0x07 0x00 0xd0
-
# CHECK: mxdb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x07
@@ -7966,6 +9382,39 @@
# CHECK: mxdb %f13, 0
0xed 0xd0 0x00 0x00 0x00 0x07
+# CHECK: mxdbr %f0, %f0
+0xb3 0x07 0x00 0x00
+
+# CHECK: mxdbr %f0, %f15
+0xb3 0x07 0x00 0x0f
+
+# CHECK: mxdbr %f8, %f8
+0xb3 0x07 0x00 0x88
+
+# CHECK: mxdbr %f13, %f0
+0xb3 0x07 0x00 0xd0
+
+# CHECK: n %r0, 0
+0x54 0x00 0x00 0x00
+
+# CHECK: n %r0, 4095
+0x54 0x00 0x0f 0xff
+
+# CHECK: n %r0, 0(%r1)
+0x54 0x00 0x10 0x00
+
+# CHECK: n %r0, 0(%r15)
+0x54 0x00 0xf0 0x00
+
+# CHECK: n %r0, 4095(%r1,%r15)
+0x54 0x01 0xff 0xff
+
+# CHECK: n %r0, 4095(%r15,%r1)
+0x54 0x0f 0x1f 0xff
+
+# CHECK: n %r15, 0
+0x54 0xf0 0x00 0x00
+
# CHECK: nc 0(1), 0
0xd4 0x00 0x00 0x00 0x00 0x00
@@ -8002,27 +9451,9 @@
# CHECK: nc 0(256,%r15), 0
0xd4 0xff 0xf0 0x00 0x00 0x00
-# CHECK: ngr %r0, %r0
-0xb9 0x80 0x00 0x00
-
-# CHECK: ngr %r0, %r15
-0xb9 0x80 0x00 0x0f
-
-# CHECK: ngr %r15, %r0
-0xb9 0x80 0x00 0xf0
-
-# CHECK: ngr %r7, %r8
-0xb9 0x80 0x00 0x78
-
# CHECK: ng %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x80
-# CHECK: ngrk %r0, %r0, %r0
-0xb9 0xe4 0x00 0x00
-
-# CHECK: ngrk %r2, %r3, %r4
-0xb9 0xe4 0x40 0x23
-
# CHECK: ng %r0, -1
0xe3 0x00 0x0f 0xff 0xff 0x80
@@ -8050,6 +9481,57 @@
# CHECK: ng %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x80
+# CHECK: ngr %r0, %r0
+0xb9 0x80 0x00 0x00
+
+# CHECK: ngr %r0, %r15
+0xb9 0x80 0x00 0x0f
+
+# CHECK: ngr %r15, %r0
+0xb9 0x80 0x00 0xf0
+
+# CHECK: ngr %r7, %r8
+0xb9 0x80 0x00 0x78
+
+# CHECK: ngrk %r0, %r0, %r0
+0xb9 0xe4 0x00 0x00
+
+# CHECK: ngrk %r2, %r3, %r4
+0xb9 0xe4 0x40 0x23
+
+# CHECK: ni 0, 0
+0x94 0x00 0x00 0x00
+
+# CHECK: ni 4095, 0
+0x94 0x00 0x0f 0xff
+
+# CHECK: ni 0, 255
+0x94 0xff 0x00 0x00
+
+# CHECK: ni 0(%r1), 42
+0x94 0x2a 0x10 0x00
+
+# CHECK: ni 0(%r15), 42
+0x94 0x2a 0xf0 0x00
+
+# CHECK: ni 4095(%r1), 42
+0x94 0x2a 0x1f 0xff
+
+# CHECK: ni 4095(%r15), 42
+0x94 0x2a 0xff 0xff
+
+# CHECK: niai 0, 0
+0xb2 0xfa 0x00 0x00
+
+# CHECK: niai 15, 0
+0xb2 0xfa 0x00 0xf0
+
+# CHECK: niai 0, 15
+0xb2 0xfa 0x00 0x0f
+
+# CHECK: niai 15, 15
+0xb2 0xfa 0x00 0xff
+
# CHECK: nihf %r0, 0
0xc0 0x0a 0x00 0x00 0x00 0x00
@@ -8116,27 +9598,6 @@
# CHECK: nill %r15, 0
0xa5 0xf7 0x00 0x00
-# CHECK: ni 0, 0
-0x94 0x00 0x00 0x00
-
-# CHECK: ni 4095, 0
-0x94 0x00 0x0f 0xff
-
-# CHECK: ni 0, 255
-0x94 0xff 0x00 0x00
-
-# CHECK: ni 0(%r1), 42
-0x94 0x2a 0x10 0x00
-
-# CHECK: ni 0(%r15), 42
-0x94 0x2a 0xf0 0x00
-
-# CHECK: ni 4095(%r1), 42
-0x94 0x2a 0x1f 0xff
-
-# CHECK: ni 4095(%r15), 42
-0x94 0x2a 0xff 0xff
-
# CHECK: niy -524288, 0
0xeb 0x00 0x00 0x00 0x80 0x54
@@ -8185,26 +9646,35 @@
# CHECK: nrk %r2, %r3, %r4
0xb9 0xf4 0x40 0x23
-# CHECK: n %r0, 0
-0x54 0x00 0x00 0x00
+# CHECK: ntstg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x25
-# CHECK: n %r0, 4095
-0x54 0x00 0x0f 0xff
+# CHECK: ntstg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x25
-# CHECK: n %r0, 0(%r1)
-0x54 0x00 0x10 0x00
+# CHECK: ntstg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x25
-# CHECK: n %r0, 0(%r15)
-0x54 0x00 0xf0 0x00
+# CHECK: ntstg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x25
-# CHECK: n %r0, 4095(%r1,%r15)
-0x54 0x01 0xff 0xff
+# CHECK: ntstg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x25
-# CHECK: n %r0, 4095(%r15,%r1)
-0x54 0x0f 0x1f 0xff
+# CHECK: ntstg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x25
-# CHECK: n %r15, 0
-0x54 0xf0 0x00 0x00
+# CHECK: ntstg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x25
+
+# CHECK: ntstg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x25
+
+# CHECK: ntstg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x25
+
+# CHECK: ntstg %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x25
# CHECK: ny %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x54
@@ -8236,47 +9706,26 @@
# CHECK: ny %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x54
-# CHECK: niai 0, 0
-0xb2 0xfa 0x00 0x00
-
-# CHECK: niai 15, 0
-0xb2 0xfa 0x00 0xf0
-
-# CHECK: niai 0, 15
-0xb2 0xfa 0x00 0x0f
-
-# CHECK: niai 15, 15
-0xb2 0xfa 0x00 0xff
-
-# CHECK: ntstg %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x25
-
-# CHECK: ntstg %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x25
-
-# CHECK: ntstg %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x25
-
-# CHECK: ntstg %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x25
+# CHECK: o %r0, 0
+0x56 0x00 0x00 0x00
-# CHECK: ntstg %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x25
+# CHECK: o %r0, 4095
+0x56 0x00 0x0f 0xff
-# CHECK: ntstg %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x25
+# CHECK: o %r0, 0(%r1)
+0x56 0x00 0x10 0x00
-# CHECK: ntstg %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x25
+# CHECK: o %r0, 0(%r15)
+0x56 0x00 0xf0 0x00
-# CHECK: ntstg %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x25
+# CHECK: o %r0, 4095(%r1,%r15)
+0x56 0x01 0xff 0xff
-# CHECK: ntstg %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x25
+# CHECK: o %r0, 4095(%r15,%r1)
+0x56 0x0f 0x1f 0xff
-# CHECK: ntstg %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x25
+# CHECK: o %r15, 0
+0x56 0xf0 0x00 0x00
# CHECK: oc 0(1), 0
0xd6 0x00 0x00 0x00 0x00 0x00
@@ -8314,24 +9763,6 @@
# CHECK: oc 0(256,%r15), 0
0xd6 0xff 0xf0 0x00 0x00 0x00
-# CHECK: ogr %r0, %r0
-0xb9 0x81 0x00 0x00
-
-# CHECK: ogr %r0, %r15
-0xb9 0x81 0x00 0x0f
-
-# CHECK: ogr %r15, %r0
-0xb9 0x81 0x00 0xf0
-
-# CHECK: ogr %r7, %r8
-0xb9 0x81 0x00 0x78
-
-# CHECK: ogrk %r0, %r0, %r0
-0xb9 0xe6 0x00 0x00
-
-# CHECK: ogrk %r2, %r3, %r4
-0xb9 0xe6 0x40 0x23
-
# CHECK: og %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x81
@@ -8362,6 +9793,45 @@
# CHECK: og %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x81
+# CHECK: ogr %r0, %r0
+0xb9 0x81 0x00 0x00
+
+# CHECK: ogr %r0, %r15
+0xb9 0x81 0x00 0x0f
+
+# CHECK: ogr %r15, %r0
+0xb9 0x81 0x00 0xf0
+
+# CHECK: ogr %r7, %r8
+0xb9 0x81 0x00 0x78
+
+# CHECK: ogrk %r0, %r0, %r0
+0xb9 0xe6 0x00 0x00
+
+# CHECK: ogrk %r2, %r3, %r4
+0xb9 0xe6 0x40 0x23
+
+# CHECK: oi 0, 0
+0x96 0x00 0x00 0x00
+
+# CHECK: oi 4095, 0
+0x96 0x00 0x0f 0xff
+
+# CHECK: oi 0, 255
+0x96 0xff 0x00 0x00
+
+# CHECK: oi 0(%r1), 42
+0x96 0x2a 0x10 0x00
+
+# CHECK: oi 0(%r15), 42
+0x96 0x2a 0xf0 0x00
+
+# CHECK: oi 4095(%r1), 42
+0x96 0x2a 0x1f 0xff
+
+# CHECK: oi 4095(%r15), 42
+0x96 0x2a 0xff 0xff
+
# CHECK: oihf %r0, 0
0xc0 0x0c 0x00 0x00 0x00 0x00
@@ -8428,27 +9898,6 @@
# CHECK: oill %r15, 0
0xa5 0xfb 0x00 0x00
-# CHECK: oi 0, 0
-0x96 0x00 0x00 0x00
-
-# CHECK: oi 4095, 0
-0x96 0x00 0x0f 0xff
-
-# CHECK: oi 0, 255
-0x96 0xff 0x00 0x00
-
-# CHECK: oi 0(%r1), 42
-0x96 0x2a 0x10 0x00
-
-# CHECK: oi 0(%r15), 42
-0x96 0x2a 0xf0 0x00
-
-# CHECK: oi 4095(%r1), 42
-0x96 0x2a 0x1f 0xff
-
-# CHECK: oi 4095(%r15), 42
-0x96 0x2a 0xff 0xff
-
# CHECK: oiy -524288, 0
0xeb 0x00 0x00 0x00 0x80 0x56
@@ -8497,27 +9946,6 @@
# CHECK: ork %r2, %r3, %r4
0xb9 0xf6 0x40 0x23
-# CHECK: o %r0, 0
-0x56 0x00 0x00 0x00
-
-# CHECK: o %r0, 4095
-0x56 0x00 0x0f 0xff
-
-# CHECK: o %r0, 0(%r1)
-0x56 0x00 0x10 0x00
-
-# CHECK: o %r0, 0(%r15)
-0x56 0x00 0xf0 0x00
-
-# CHECK: o %r0, 4095(%r1,%r15)
-0x56 0x01 0xff 0xff
-
-# CHECK: o %r0, 4095(%r15,%r1)
-0x56 0x0f 0x1f 0xff
-
-# CHECK: o %r15, 0
-0x56 0xf0 0x00 0x00
-
# CHECK: oy %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x56
@@ -8548,6 +9976,51 @@
# CHECK: oy %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x56
+# CHECK: pack 0(1), 0(1)
+0xf2 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: pack 0(1), 0(1,%r1)
+0xf2 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: pack 0(1), 0(1,%r15)
+0xf2 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: pack 0(1), 4095(1)
+0xf2 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: pack 0(1), 4095(1,%r1)
+0xf2 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: pack 0(1), 4095(1,%r15)
+0xf2 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: pack 0(1,%r1), 0(1)
+0xf2 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: pack 0(1,%r15), 0(1)
+0xf2 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: pack 4095(1,%r1), 0(1)
+0xf2 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: pack 4095(1,%r15), 0(1)
+0xf2 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: pack 0(16,%r1), 0(1)
+0xf2 0xf0 0x10 0x00 0x00 0x00
+
+# CHECK: pack 0(16,%r15), 0(1)
+0xf2 0xf0 0xf0 0x00 0x00 0x00
+
+# CHECK: pack 0(1), 0(16,%r1)
+0xf2 0x0f 0x00 0x00 0x10 0x00
+
+# CHECK: pack 0(1), 0(16,%r15)
+0xf2 0x0f 0x00 0x00 0xf0 0x00
+
+# CHECK: pcc
+0xb9 0x2c 0x00 0x00
+
# CHECK: pfd 0, -524288
0xe3 0x00 0x00 0x00 0x80 0x36
@@ -8578,6 +10051,78 @@
# CHECK: pfd 15, 0
0xe3 0xf0 0x00 0x00 0x00 0x36
+# CHECK: pka 0, 0(1)
+0xe9 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: pka 0, 0(1,%r1)
+0xe9 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: pka 0, 0(1,%r15)
+0xe9 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: pka 0, 4095(1)
+0xe9 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: pka 0, 4095(1,%r1)
+0xe9 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: pka 0, 4095(1,%r15)
+0xe9 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: pka 0(%r1), 0(1)
+0xe9 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: pka 0(%r15), 0(1)
+0xe9 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: pka 4095(%r1), 0(1)
+0xe9 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: pka 4095(%r15), 0(1)
+0xe9 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: pka 0, 0(256,%r1)
+0xe9 0xff 0x00 0x00 0x10 0x00
+
+# CHECK: pka 0, 0(256,%r15)
+0xe9 0xff 0x00 0x00 0xf0 0x00
+
+# CHECK: pku 0, 0(1)
+0xe1 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: pku 0, 0(1,%r1)
+0xe1 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: pku 0, 0(1,%r15)
+0xe1 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: pku 0, 4095(1)
+0xe1 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: pku 0, 4095(1,%r1)
+0xe1 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: pku 0, 4095(1,%r15)
+0xe1 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: pku 0(%r1), 0(1)
+0xe1 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: pku 0(%r15), 0(1)
+0xe1 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: pku 4095(%r1), 0(1)
+0xe1 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: pku 4095(%r15), 0(1)
+0xe1 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: pku 0, 0(256,%r1)
+0xe1 0xff 0x00 0x00 0x10 0x00
+
+# CHECK: pku 0, 0(256,%r15)
+0xe1 0xff 0x00 0x00 0xf0 0x00
+
# CHECK: plo %r0, 0, %r0, 0
0xee 0x00 0x00 0x00 0x00 0x00
@@ -8599,9 +10144,6 @@
# CHECK: plo %r2, 4095(%r1), %r4, 0(%r15)
0xee 0x24 0x1f 0xff 0xf0 0x00
-# CHECK: pr
-0x01 0x01
-
# CHECK: popcnt %r0, %r0
0xb9 0xe1 0x00 0x00
@@ -8629,6 +10171,9 @@
# CHECK: ppa %r15, %r0, 0
0xb2 0xe8 0x00 0xf0
+# CHECK: pr
+0x01 0x01
+
# CHECK: risbg %r0, %r0, 0, 0, 0
0xec 0x00 0x00 0x00 0x00 0x55
@@ -8713,6 +10258,78 @@
# CHECK: risblg %r4, %r5, 6, 7, 8
0xec 0x45 0x06 0x07 0x08 0x51
+# CHECK: rll %r0, %r0, 0
+0xeb 0x00 0x00 0x00 0x00 0x1d
+
+# CHECK: rll %r15, %r1, 0
+0xeb 0xf1 0x00 0x00 0x00 0x1d
+
+# CHECK: rll %r1, %r15, 0
+0xeb 0x1f 0x00 0x00 0x00 0x1d
+
+# CHECK: rll %r15, %r15, 0
+0xeb 0xff 0x00 0x00 0x00 0x1d
+
+# CHECK: rll %r0, %r0, -524288
+0xeb 0x00 0x00 0x00 0x80 0x1d
+
+# CHECK: rll %r0, %r0, -1
+0xeb 0x00 0x0f 0xff 0xff 0x1d
+
+# CHECK: rll %r0, %r0, 1
+0xeb 0x00 0x00 0x01 0x00 0x1d
+
+# CHECK: rll %r0, %r0, 524287
+0xeb 0x00 0x0f 0xff 0x7f 0x1d
+
+# CHECK: rll %r0, %r0, 0(%r1)
+0xeb 0x00 0x10 0x00 0x00 0x1d
+
+# CHECK: rll %r0, %r0, 0(%r15)
+0xeb 0x00 0xf0 0x00 0x00 0x1d
+
+# CHECK: rll %r0, %r0, 524287(%r1)
+0xeb 0x00 0x1f 0xff 0x7f 0x1d
+
+# CHECK: rll %r0, %r0, 524287(%r15)
+0xeb 0x00 0xff 0xff 0x7f 0x1d
+
+# CHECK: rllg %r0, %r0, 0
+0xeb 0x00 0x00 0x00 0x00 0x1c
+
+# CHECK: rllg %r15, %r1, 0
+0xeb 0xf1 0x00 0x00 0x00 0x1c
+
+# CHECK: rllg %r1, %r15, 0
+0xeb 0x1f 0x00 0x00 0x00 0x1c
+
+# CHECK: rllg %r15, %r15, 0
+0xeb 0xff 0x00 0x00 0x00 0x1c
+
+# CHECK: rllg %r0, %r0, -524288
+0xeb 0x00 0x00 0x00 0x80 0x1c
+
+# CHECK: rllg %r0, %r0, -1
+0xeb 0x00 0x0f 0xff 0xff 0x1c
+
+# CHECK: rllg %r0, %r0, 1
+0xeb 0x00 0x00 0x01 0x00 0x1c
+
+# CHECK: rllg %r0, %r0, 524287
+0xeb 0x00 0x0f 0xff 0x7f 0x1c
+
+# CHECK: rllg %r0, %r0, 0(%r1)
+0xeb 0x00 0x10 0x00 0x00 0x1c
+
+# CHECK: rllg %r0, %r0, 0(%r15)
+0xeb 0x00 0xf0 0x00 0x00 0x1c
+
+# CHECK: rllg %r0, %r0, 524287(%r1)
+0xeb 0x00 0x1f 0xff 0x7f 0x1c
+
+# CHECK: rllg %r0, %r0, 524287(%r15)
+0xeb 0x00 0xff 0xff 0x7f 0x1c
+
# CHECK: rnsbg %r0, %r0, 0, 0, 0
0xec 0x00 0x00 0x00 0x00 0x54
@@ -8776,77 +10393,35 @@
# CHECK: rxsbg %r4, %r5, 6, 7, 8
0xec 0x45 0x06 0x07 0x08 0x57
-# CHECK: rllg %r0, %r0, 0
-0xeb 0x00 0x00 0x00 0x00 0x1c
-
-# CHECK: rllg %r15, %r1, 0
-0xeb 0xf1 0x00 0x00 0x00 0x1c
-
-# CHECK: rllg %r1, %r15, 0
-0xeb 0x1f 0x00 0x00 0x00 0x1c
-
-# CHECK: rllg %r15, %r15, 0
-0xeb 0xff 0x00 0x00 0x00 0x1c
-
-# CHECK: rllg %r0, %r0, -524288
-0xeb 0x00 0x00 0x00 0x80 0x1c
-
-# CHECK: rllg %r0, %r0, -1
-0xeb 0x00 0x0f 0xff 0xff 0x1c
-
-# CHECK: rllg %r0, %r0, 1
-0xeb 0x00 0x00 0x01 0x00 0x1c
-
-# CHECK: rllg %r0, %r0, 524287
-0xeb 0x00 0x0f 0xff 0x7f 0x1c
-
-# CHECK: rllg %r0, %r0, 0(%r1)
-0xeb 0x00 0x10 0x00 0x00 0x1c
-
-# CHECK: rllg %r0, %r0, 0(%r15)
-0xeb 0x00 0xf0 0x00 0x00 0x1c
-
-# CHECK: rllg %r0, %r0, 524287(%r1)
-0xeb 0x00 0x1f 0xff 0x7f 0x1c
-
-# CHECK: rllg %r0, %r0, 524287(%r15)
-0xeb 0x00 0xff 0xff 0x7f 0x1c
-
-# CHECK: rll %r0, %r0, 0
-0xeb 0x00 0x00 0x00 0x00 0x1d
-
-# CHECK: rll %r15, %r1, 0
-0xeb 0xf1 0x00 0x00 0x00 0x1d
-
-# CHECK: rll %r1, %r15, 0
-0xeb 0x1f 0x00 0x00 0x00 0x1d
+# CHECK: s %r0, 0
+0x5b 0x00 0x00 0x00
-# CHECK: rll %r15, %r15, 0
-0xeb 0xff 0x00 0x00 0x00 0x1d
+# CHECK: s %r0, 4095
+0x5b 0x00 0x0f 0xff
-# CHECK: rll %r0, %r0, -524288
-0xeb 0x00 0x00 0x00 0x80 0x1d
+# CHECK: s %r0, 0(%r1)
+0x5b 0x00 0x10 0x00
-# CHECK: rll %r0, %r0, -1
-0xeb 0x00 0x0f 0xff 0xff 0x1d
+# CHECK: s %r0, 0(%r15)
+0x5b 0x00 0xf0 0x00
-# CHECK: rll %r0, %r0, 1
-0xeb 0x00 0x00 0x01 0x00 0x1d
+# CHECK: s %r0, 4095(%r1,%r15)
+0x5b 0x01 0xff 0xff
-# CHECK: rll %r0, %r0, 524287
-0xeb 0x00 0x0f 0xff 0x7f 0x1d
+# CHECK: s %r0, 4095(%r15,%r1)
+0x5b 0x0f 0x1f 0xff
-# CHECK: rll %r0, %r0, 0(%r1)
-0xeb 0x00 0x10 0x00 0x00 0x1d
+# CHECK: s %r15, 0
+0x5b 0xf0 0x00 0x00
-# CHECK: rll %r0, %r0, 0(%r15)
-0xeb 0x00 0xf0 0x00 0x00 0x1d
+# CHECK: sam24
+0x01 0x0c
-# CHECK: rll %r0, %r0, 524287(%r1)
-0xeb 0x00 0x1f 0xff 0x7f 0x1d
+# CHECK: sam31
+0x01 0x0d
-# CHECK: rll %r0, %r0, 524287(%r15)
-0xeb 0x00 0xff 0xff 0x7f 0x1d
+# CHECK: sam64
+0x01 0x0e
# CHECK: sar %a0, %r0
0xb2 0x4e 0x00 0x00
@@ -8863,27 +10438,6 @@
# CHECK: sar %a15, %r15
0xb2 0x4e 0x00 0xff
-# CHECK: sam24
-0x01 0x0c
-
-# CHECK: sam31
-0x01 0x0d
-
-# CHECK: sam64
-0x01 0x0e
-
-# CHECK: sdbr %f0, %f0
-0xb3 0x1b 0x00 0x00
-
-# CHECK: sdbr %f0, %f15
-0xb3 0x1b 0x00 0x0f
-
-# CHECK: sdbr %f7, %f8
-0xb3 0x1b 0x00 0x78
-
-# CHECK: sdbr %f15, %f0
-0xb3 0x1b 0x00 0xf0
-
# CHECK: sdb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x1b
@@ -8905,17 +10459,17 @@
# CHECK: sdb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x1b
-# CHECK: sebr %f0, %f0
-0xb3 0x0b 0x00 0x00
+# CHECK: sdbr %f0, %f0
+0xb3 0x1b 0x00 0x00
-# CHECK: sebr %f0, %f15
-0xb3 0x0b 0x00 0x0f
+# CHECK: sdbr %f0, %f15
+0xb3 0x1b 0x00 0x0f
-# CHECK: sebr %f7, %f8
-0xb3 0x0b 0x00 0x78
+# CHECK: sdbr %f7, %f8
+0xb3 0x1b 0x00 0x78
-# CHECK: sebr %f15, %f0
-0xb3 0x0b 0x00 0xf0
+# CHECK: sdbr %f15, %f0
+0xb3 0x1b 0x00 0xf0
# CHECK: seb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x0b
@@ -8938,6 +10492,18 @@
# CHECK: seb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x0b
+# CHECK: sebr %f0, %f0
+0xb3 0x0b 0x00 0x00
+
+# CHECK: sebr %f0, %f15
+0xb3 0x0b 0x00 0x0f
+
+# CHECK: sebr %f7, %f8
+0xb3 0x0b 0x00 0x78
+
+# CHECK: sebr %f15, %f0
+0xb3 0x0b 0x00 0xf0
+
# CHECK: sfasr %r0
0xb3 0x85 0x00 0x00
@@ -8956,17 +10522,35 @@
# CHECK: sfpc %r15
0xb3 0x84 0x00 0xf0
-# CHECK: sgfr %r0, %r0
-0xb9 0x19 0x00 0x00
+# CHECK: sg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x09
-# CHECK: sgfr %r0, %r15
-0xb9 0x19 0x00 0x0f
+# CHECK: sg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x09
-# CHECK: sgfr %r15, %r0
-0xb9 0x19 0x00 0xf0
+# CHECK: sg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x09
-# CHECK: sgfr %r7, %r8
-0xb9 0x19 0x00 0x78
+# CHECK: sg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x09
+
+# CHECK: sg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x09
+
+# CHECK: sg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x09
+
+# CHECK: sg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x09
+
+# CHECK: sg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x09
+
+# CHECK: sg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x09
+
+# CHECK: sg %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x09
# CHECK: sgf %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x19
@@ -8998,6 +10582,18 @@
# CHECK: sgf %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x19
+# CHECK: sgfr %r0, %r0
+0xb9 0x19 0x00 0x00
+
+# CHECK: sgfr %r0, %r15
+0xb9 0x19 0x00 0x0f
+
+# CHECK: sgfr %r15, %r0
+0xb9 0x19 0x00 0xf0
+
+# CHECK: sgfr %r7, %r8
+0xb9 0x19 0x00 0x78
+
# CHECK: sgr %r0, %r0
0xb9 0x09 0x00 0x00
@@ -9016,36 +10612,6 @@
# CHECK: sgrk %r2, %r3, %r4
0xb9 0xe9 0x40 0x23
-# CHECK: sg %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x09
-
-# CHECK: sg %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x09
-
-# CHECK: sg %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x09
-
-# CHECK: sg %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x09
-
-# CHECK: sg %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x09
-
-# CHECK: sg %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x09
-
-# CHECK: sg %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x09
-
-# CHECK: sg %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x09
-
-# CHECK: sg %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x09
-
-# CHECK: sg %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x09
-
# CHECK: sh %r0, 0
0x4b 0x00 0x00 0x00
@@ -9097,6 +10663,27 @@
# CHECK: shy %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x7b
+# CHECK: sl %r0, 0
+0x5f 0x00 0x00 0x00
+
+# CHECK: sl %r0, 4095
+0x5f 0x00 0x0f 0xff
+
+# CHECK: sl %r0, 0(%r1)
+0x5f 0x00 0x10 0x00
+
+# CHECK: sl %r0, 0(%r15)
+0x5f 0x00 0xf0 0x00
+
+# CHECK: sl %r0, 4095(%r1,%r15)
+0x5f 0x01 0xff 0xff
+
+# CHECK: sl %r0, 4095(%r15,%r1)
+0x5f 0x0f 0x1f 0xff
+
+# CHECK: sl %r15, 0
+0x5f 0xf0 0x00 0x00
+
# CHECK: sla %r0, 0
0x8b 0x00 0x00 0x00
@@ -9121,6 +10708,42 @@
# CHECK: sla %r0, 4095(%r15)
0x8b 0x00 0xff 0xff
+# CHECK: slag %r0, %r0, 0
+0xeb 0x00 0x00 0x00 0x00 0x0b
+
+# CHECK: slag %r15, %r1, 0
+0xeb 0xf1 0x00 0x00 0x00 0x0b
+
+# CHECK: slag %r1, %r15, 0
+0xeb 0x1f 0x00 0x00 0x00 0x0b
+
+# CHECK: slag %r15, %r15, 0
+0xeb 0xff 0x00 0x00 0x00 0x0b
+
+# CHECK: slag %r0, %r0, -524288
+0xeb 0x00 0x00 0x00 0x80 0x0b
+
+# CHECK: slag %r0, %r0, -1
+0xeb 0x00 0x0f 0xff 0xff 0x0b
+
+# CHECK: slag %r0, %r0, 1
+0xeb 0x00 0x00 0x01 0x00 0x0b
+
+# CHECK: slag %r0, %r0, 524287
+0xeb 0x00 0x0f 0xff 0x7f 0x0b
+
+# CHECK: slag %r0, %r0, 0(%r1)
+0xeb 0x00 0x10 0x00 0x00 0x0b
+
+# CHECK: slag %r0, %r0, 0(%r15)
+0xeb 0x00 0xf0 0x00 0x00 0x0b
+
+# CHECK: slag %r0, %r0, 524287(%r1)
+0xeb 0x00 0x1f 0xff 0x7f 0x0b
+
+# CHECK: slag %r0, %r0, 524287(%r15)
+0xeb 0x00 0xff 0xff 0x7f 0x0b
+
# CHECK: slak %r0, %r0, 0
0xeb 0x00 0x00 0x00 0x00 0xdd
@@ -9157,17 +10780,35 @@
# CHECK: slak %r0, %r0, 524287(%r15)
0xeb 0x00 0xff 0xff 0x7f 0xdd
-# CHECK: slbgr %r0, %r0
-0xb9 0x89 0x00 0x00
+# CHECK: slb %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x99
-# CHECK: slbgr %r0, %r15
-0xb9 0x89 0x00 0x0f
+# CHECK: slb %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x99
-# CHECK: slbgr %r15, %r0
-0xb9 0x89 0x00 0xf0
+# CHECK: slb %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x99
-# CHECK: slbgr %r7, %r8
-0xb9 0x89 0x00 0x78
+# CHECK: slb %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x99
+
+# CHECK: slb %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x99
+
+# CHECK: slb %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x99
+
+# CHECK: slb %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x99
+
+# CHECK: slb %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x99
+
+# CHECK: slb %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x99
+
+# CHECK: slb %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x99
# CHECK: slbg %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x89
@@ -9199,6 +10840,18 @@
# CHECK: slbg %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x89
+# CHECK: slbgr %r0, %r0
+0xb9 0x89 0x00 0x00
+
+# CHECK: slbgr %r0, %r15
+0xb9 0x89 0x00 0x0f
+
+# CHECK: slbgr %r15, %r0
+0xb9 0x89 0x00 0xf0
+
+# CHECK: slbgr %r7, %r8
+0xb9 0x89 0x00 0x78
+
# CHECK: slbr %r0, %r0
0xb9 0x99 0x00 0x00
@@ -9211,35 +10864,53 @@
# CHECK: slbr %r7, %r8
0xb9 0x99 0x00 0x78
-# CHECK: slb %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x99
+# CHECK: slda %r0, 0
+0x8f 0x00 0x00 0x00
-# CHECK: slb %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x99
+# CHECK: slda %r6, 0
+0x8f 0x60 0x00 0x00
-# CHECK: slb %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x99
+# CHECK: slda %r14, 0
+0x8f 0xe0 0x00 0x00
-# CHECK: slb %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x99
+# CHECK: slda %r0, 4095
+0x8f 0x00 0x0f 0xff
-# CHECK: slb %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x99
+# CHECK: slda %r0, 0(%r1)
+0x8f 0x00 0x10 0x00
-# CHECK: slb %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x99
+# CHECK: slda %r0, 0(%r15)
+0x8f 0x00 0xf0 0x00
-# CHECK: slb %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x99
+# CHECK: slda %r0, 4095(%r1)
+0x8f 0x00 0x1f 0xff
-# CHECK: slb %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x99
+# CHECK: slda %r0, 4095(%r15)
+0x8f 0x00 0xff 0xff
-# CHECK: slb %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x99
+# CHECK: sldl %r0, 0
+0x8d 0x00 0x00 0x00
-# CHECK: slb %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x99
+# CHECK: sldl %r6, 0
+0x8d 0x60 0x00 0x00
+
+# CHECK: sldl %r14, 0
+0x8d 0xe0 0x00 0x00
+
+# CHECK: sldl %r0, 4095
+0x8d 0x00 0x0f 0xff
+
+# CHECK: sldl %r0, 0(%r1)
+0x8d 0x00 0x10 0x00
+
+# CHECK: sldl %r0, 0(%r15)
+0x8d 0x00 0xf0 0x00
+
+# CHECK: sldl %r0, 4095(%r1)
+0x8d 0x00 0x1f 0xff
+
+# CHECK: sldl %r0, 4095(%r15)
+0x8d 0x00 0xff 0xff
# CHECK: slfi %r0, 0
0xc2 0x05 0x00 0x00 0x00 0x00
@@ -9250,26 +10921,35 @@
# CHECK: slfi %r15, 0
0xc2 0xf5 0x00 0x00 0x00 0x00
-# CHECK: slgfi %r0, 0
-0xc2 0x04 0x00 0x00 0x00 0x00
+# CHECK: slg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x0b
-# CHECK: slgfi %r0, 4294967295
-0xc2 0x04 0xff 0xff 0xff 0xff
+# CHECK: slg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x0b
-# CHECK: slgfi %r15, 0
-0xc2 0xf4 0x00 0x00 0x00 0x00
+# CHECK: slg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x0b
-# CHECK: slgfr %r0, %r0
-0xb9 0x1b 0x00 0x00
+# CHECK: slg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x0b
-# CHECK: slgfr %r0, %r15
-0xb9 0x1b 0x00 0x0f
+# CHECK: slg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x0b
-# CHECK: slgfr %r15, %r0
-0xb9 0x1b 0x00 0xf0
+# CHECK: slg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x0b
-# CHECK: slgfr %r7, %r8
-0xb9 0x1b 0x00 0x78
+# CHECK: slg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x0b
+
+# CHECK: slg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x0b
+
+# CHECK: slg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x0b
+
+# CHECK: slg %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x0b
# CHECK: slgf %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x1b
@@ -9301,6 +10981,27 @@
# CHECK: slgf %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x1b
+# CHECK: slgfi %r0, 0
+0xc2 0x04 0x00 0x00 0x00 0x00
+
+# CHECK: slgfi %r0, 4294967295
+0xc2 0x04 0xff 0xff 0xff 0xff
+
+# CHECK: slgfi %r15, 0
+0xc2 0xf4 0x00 0x00 0x00 0x00
+
+# CHECK: slgfr %r0, %r0
+0xb9 0x1b 0x00 0x00
+
+# CHECK: slgfr %r0, %r15
+0xb9 0x1b 0x00 0x0f
+
+# CHECK: slgfr %r15, %r0
+0xb9 0x1b 0x00 0xf0
+
+# CHECK: slgfr %r7, %r8
+0xb9 0x1b 0x00 0x78
+
# CHECK: slgr %r0, %r0
0xb9 0x0b 0x00 0x00
@@ -9319,35 +11020,29 @@
# CHECK: slgrk %r2, %r3, %r4
0xb9 0xeb 0x40 0x23
-# CHECK: slg %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x0b
-
-# CHECK: slg %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x0b
-
-# CHECK: slg %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x0b
+# CHECK: sll %r0, 0
+0x89 0x00 0x00 0x00
-# CHECK: slg %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x0b
+# CHECK: sll %r7, 0
+0x89 0x70 0x00 0x00
-# CHECK: slg %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x0b
+# CHECK: sll %r15, 0
+0x89 0xf0 0x00 0x00
-# CHECK: slg %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x0b
+# CHECK: sll %r0, 4095
+0x89 0x00 0x0f 0xff
-# CHECK: slg %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x0b
+# CHECK: sll %r0, 0(%r1)
+0x89 0x00 0x10 0x00
-# CHECK: slg %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x0b
+# CHECK: sll %r0, 0(%r15)
+0x89 0x00 0xf0 0x00
-# CHECK: slg %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x0b
+# CHECK: sll %r0, 4095(%r1)
+0x89 0x00 0x1f 0xff
-# CHECK: slg %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x0b
+# CHECK: sll %r0, 4095(%r15)
+0x89 0x00 0xff 0xff
# CHECK: sllg %r0, %r0, 0
0xeb 0x00 0x00 0x00 0x00 0x0d
@@ -9421,30 +11116,6 @@
# CHECK: sllk %r0, %r0, 524287(%r15)
0xeb 0x00 0xff 0xff 0x7f 0xdf
-# CHECK: sll %r0, 0
-0x89 0x00 0x00 0x00
-
-# CHECK: sll %r7, 0
-0x89 0x70 0x00 0x00
-
-# CHECK: sll %r15, 0
-0x89 0xf0 0x00 0x00
-
-# CHECK: sll %r0, 4095
-0x89 0x00 0x0f 0xff
-
-# CHECK: sll %r0, 0(%r1)
-0x89 0x00 0x10 0x00
-
-# CHECK: sll %r0, 0(%r15)
-0x89 0x00 0xf0 0x00
-
-# CHECK: sll %r0, 4095(%r1)
-0x89 0x00 0x1f 0xff
-
-# CHECK: sll %r0, 4095(%r15)
-0x89 0x00 0xff 0xff
-
# CHECK: slr %r0, %r0
0x1f 0x00
@@ -9463,27 +11134,6 @@
# CHECK: slrk %r2, %r3, %r4
0xb9 0xfb 0x40 0x23
-# CHECK: sl %r0, 0
-0x5f 0x00 0x00 0x00
-
-# CHECK: sl %r0, 4095
-0x5f 0x00 0x0f 0xff
-
-# CHECK: sl %r0, 0(%r1)
-0x5f 0x00 0x10 0x00
-
-# CHECK: sl %r0, 0(%r15)
-0x5f 0x00 0xf0 0x00
-
-# CHECK: sl %r0, 4095(%r1,%r15)
-0x5f 0x01 0xff 0xff
-
-# CHECK: sl %r0, 4095(%r15,%r1)
-0x5f 0x0f 0x1f 0xff
-
-# CHECK: sl %r15, 0
-0x5f 0xf0 0x00 0x00
-
# CHECK: sly %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x5f
@@ -9514,8 +11164,47 @@
# CHECK: sly %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x5f
-# CHECK: sqdbr %f0, %f0
-0xb3 0x15 0x00 0x00
+# CHECK: sp 0(1), 0(1)
+0xfb 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: sp 0(1), 0(1,%r1)
+0xfb 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: sp 0(1), 0(1,%r15)
+0xfb 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: sp 0(1), 4095(1)
+0xfb 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: sp 0(1), 4095(1,%r1)
+0xfb 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: sp 0(1), 4095(1,%r15)
+0xfb 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: sp 0(1,%r1), 0(1)
+0xfb 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: sp 0(1,%r15), 0(1)
+0xfb 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: sp 4095(1,%r1), 0(1)
+0xfb 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: sp 4095(1,%r15), 0(1)
+0xfb 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: sp 0(16,%r1), 0(1)
+0xfb 0xf0 0x10 0x00 0x00 0x00
+
+# CHECK: sp 0(16,%r15), 0(1)
+0xfb 0xf0 0xf0 0x00 0x00 0x00
+
+# CHECK: sp 0(1), 0(16,%r1)
+0xfb 0x0f 0x00 0x00 0x10 0x00
+
+# CHECK: sp 0(1), 0(16,%r15)
+0xfb 0x0f 0x00 0x00 0xf0 0x00
# CHECK: spm %r0
0x04 0x00
@@ -9526,15 +11215,6 @@
# CHECK: spm %r15
0x04 0xf0
-# CHECK: sqdbr %f0, %f15
-0xb3 0x15 0x00 0x0f
-
-# CHECK: sqdbr %f7, %f8
-0xb3 0x15 0x00 0x78
-
-# CHECK: sqdbr %f15, %f0
-0xb3 0x15 0x00 0xf0
-
# CHECK: sqdb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x15
@@ -9556,17 +11236,17 @@
# CHECK: sqdb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x15
-# CHECK: sqebr %f0, %f0
-0xb3 0x14 0x00 0x00
+# CHECK: sqdbr %f0, %f0
+0xb3 0x15 0x00 0x00
-# CHECK: sqebr %f0, %f15
-0xb3 0x14 0x00 0x0f
+# CHECK: sqdbr %f0, %f15
+0xb3 0x15 0x00 0x0f
-# CHECK: sqebr %f7, %f8
-0xb3 0x14 0x00 0x78
+# CHECK: sqdbr %f7, %f8
+0xb3 0x15 0x00 0x78
-# CHECK: sqebr %f15, %f0
-0xb3 0x14 0x00 0xf0
+# CHECK: sqdbr %f15, %f0
+0xb3 0x15 0x00 0xf0
# CHECK: sqeb %f0, 0
0xed 0x00 0x00 0x00 0x00 0x14
@@ -9589,6 +11269,18 @@
# CHECK: sqeb %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x14
+# CHECK: sqebr %f0, %f0
+0xb3 0x14 0x00 0x00
+
+# CHECK: sqebr %f0, %f15
+0xb3 0x14 0x00 0x0f
+
+# CHECK: sqebr %f7, %f8
+0xb3 0x14 0x00 0x78
+
+# CHECK: sqebr %f15, %f0
+0xb3 0x14 0x00 0xf0
+
# CHECK: sqxbr %f0, %f0
0xb3 0x16 0x00 0x00
@@ -9601,6 +11293,42 @@
# CHECK: sqxbr %f13, %f0
0xb3 0x16 0x00 0xd0
+# CHECK: sr %r0, %r0
+0x1b 0x00
+
+# CHECK: sr %r0, %r15
+0x1b 0x0f
+
+# CHECK: sr %r15, %r0
+0x1b 0xf0
+
+# CHECK: sr %r7, %r8
+0x1b 0x78
+
+# CHECK: sra %r0, 0
+0x8a 0x00 0x00 0x00
+
+# CHECK: sra %r7, 0
+0x8a 0x70 0x00 0x00
+
+# CHECK: sra %r15, 0
+0x8a 0xf0 0x00 0x00
+
+# CHECK: sra %r0, 4095
+0x8a 0x00 0x0f 0xff
+
+# CHECK: sra %r0, 0(%r1)
+0x8a 0x00 0x10 0x00
+
+# CHECK: sra %r0, 0(%r15)
+0x8a 0x00 0xf0 0x00
+
+# CHECK: sra %r0, 4095(%r1)
+0x8a 0x00 0x1f 0xff
+
+# CHECK: sra %r0, 4095(%r15)
+0x8a 0x00 0xff 0xff
+
# CHECK: srag %r0, %r0, 0
0xeb 0x00 0x00 0x00 0x00 0x0a
@@ -9673,29 +11401,83 @@
# CHECK: srak %r0, %r0, 524287(%r15)
0xeb 0x00 0xff 0xff 0x7f 0xdc
-# CHECK: sra %r0, 0
-0x8a 0x00 0x00 0x00
+# CHECK: srda %r0, 0
+0x8e 0x00 0x00 0x00
-# CHECK: sra %r7, 0
-0x8a 0x70 0x00 0x00
+# CHECK: srda %r6, 0
+0x8e 0x60 0x00 0x00
-# CHECK: sra %r15, 0
-0x8a 0xf0 0x00 0x00
+# CHECK: srda %r14, 0
+0x8e 0xe0 0x00 0x00
-# CHECK: sra %r0, 4095
-0x8a 0x00 0x0f 0xff
+# CHECK: srda %r0, 4095
+0x8e 0x00 0x0f 0xff
-# CHECK: sra %r0, 0(%r1)
-0x8a 0x00 0x10 0x00
+# CHECK: srda %r0, 0(%r1)
+0x8e 0x00 0x10 0x00
-# CHECK: sra %r0, 0(%r15)
-0x8a 0x00 0xf0 0x00
+# CHECK: srda %r0, 0(%r15)
+0x8e 0x00 0xf0 0x00
-# CHECK: sra %r0, 4095(%r1)
-0x8a 0x00 0x1f 0xff
+# CHECK: srda %r0, 4095(%r1)
+0x8e 0x00 0x1f 0xff
-# CHECK: sra %r0, 4095(%r15)
-0x8a 0x00 0xff 0xff
+# CHECK: srda %r0, 4095(%r15)
+0x8e 0x00 0xff 0xff
+
+# CHECK: srdl %r0, 0
+0x8c 0x00 0x00 0x00
+
+# CHECK: srdl %r6, 0
+0x8c 0x60 0x00 0x00
+
+# CHECK: srdl %r14, 0
+0x8c 0xe0 0x00 0x00
+
+# CHECK: srdl %r0, 4095
+0x8c 0x00 0x0f 0xff
+
+# CHECK: srdl %r0, 0(%r1)
+0x8c 0x00 0x10 0x00
+
+# CHECK: srdl %r0, 0(%r15)
+0x8c 0x00 0xf0 0x00
+
+# CHECK: srdl %r0, 4095(%r1)
+0x8c 0x00 0x1f 0xff
+
+# CHECK: srdl %r0, 4095(%r15)
+0x8c 0x00 0xff 0xff
+
+# CHECK: srk %r0, %r0, %r0
+0xb9 0xf9 0x00 0x00
+
+# CHECK: srk %r2, %r3, %r4
+0xb9 0xf9 0x40 0x23
+
+# CHECK: srl %r0, 0
+0x88 0x00 0x00 0x00
+
+# CHECK: srl %r7, 0
+0x88 0x70 0x00 0x00
+
+# CHECK: srl %r15, 0
+0x88 0xf0 0x00 0x00
+
+# CHECK: srl %r0, 4095
+0x88 0x00 0x0f 0xff
+
+# CHECK: srl %r0, 0(%r1)
+0x88 0x00 0x10 0x00
+
+# CHECK: srl %r0, 0(%r15)
+0x88 0x00 0xf0 0x00
+
+# CHECK: srl %r0, 4095(%r1)
+0x88 0x00 0x1f 0xff
+
+# CHECK: srl %r0, 4095(%r15)
+0x88 0x00 0xff 0xff
# CHECK: srlg %r0, %r0, 0
0xeb 0x00 0x00 0x00 0x00 0x0c
@@ -9769,48 +11551,6 @@
# CHECK: srlk %r0, %r0, 524287(%r15)
0xeb 0x00 0xff 0xff 0x7f 0xde
-# CHECK: srl %r0, 0
-0x88 0x00 0x00 0x00
-
-# CHECK: srl %r7, 0
-0x88 0x70 0x00 0x00
-
-# CHECK: srl %r15, 0
-0x88 0xf0 0x00 0x00
-
-# CHECK: srl %r0, 4095
-0x88 0x00 0x0f 0xff
-
-# CHECK: srl %r0, 0(%r1)
-0x88 0x00 0x10 0x00
-
-# CHECK: srl %r0, 0(%r15)
-0x88 0x00 0xf0 0x00
-
-# CHECK: srl %r0, 4095(%r1)
-0x88 0x00 0x1f 0xff
-
-# CHECK: srl %r0, 4095(%r15)
-0x88 0x00 0xff 0xff
-
-# CHECK: sr %r0, %r0
-0x1b 0x00
-
-# CHECK: sr %r0, %r15
-0x1b 0x0f
-
-# CHECK: sr %r15, %r0
-0x1b 0xf0
-
-# CHECK: sr %r7, %r8
-0x1b 0x78
-
-# CHECK: srk %r0, %r0, %r0
-0xb9 0xf9 0x00 0x00
-
-# CHECK: srk %r2, %r3, %r4
-0xb9 0xf9 0x40 0x23
-
# CHECK: srnm 0
0xb2 0x99 0x00 0x00
@@ -9865,6 +11605,45 @@
# CHECK: srnmt 4095(%r15)
0xb2 0xb9 0xff 0xff
+# CHECK: srp 0(1), 0, 0
+0xf0 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: srp 0(1), 0, 15
+0xf0 0x0f 0x00 0x00 0x00 0x00
+
+# CHECK: srp 0(1), 0(%r1), 0
+0xf0 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: srp 0(1), 0(%r15), 0
+0xf0 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: srp 0(1), 4095, 0
+0xf0 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: srp 0(1), 4095(%r1), 0
+0xf0 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: srp 0(1), 4095(%r15), 0
+0xf0 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: srp 0(1,%r1), 0, 0
+0xf0 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: srp 0(1,%r15), 0, 0
+0xf0 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: srp 4095(1,%r1), 0, 0
+0xf0 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: srp 4095(1,%r15), 0, 0
+0xf0 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: srp 0(16,%r1), 0, 0
+0xf0 0xf0 0x10 0x00 0x00 0x00
+
+# CHECK: srp 0(16,%r15), 0, 0
+0xf0 0xf0 0xf0 0x00 0x00 0x00
+
# CHECK: srst %r0, %r0
0xb2 0x5e 0x00 0x00
@@ -9877,6 +11656,39 @@
# CHECK: srst %r7, %r8
0xb2 0x5e 0x00 0x78
+# CHECK: srstu %r0, %r0
+0xb9 0xbe 0x00 0x00
+
+# CHECK: srstu %r0, %r15
+0xb9 0xbe 0x00 0x0f
+
+# CHECK: srstu %r15, %r0
+0xb9 0xbe 0x00 0xf0
+
+# CHECK: srstu %r7, %r8
+0xb9 0xbe 0x00 0x78
+
+# CHECK: st %r0, 0
+0x50 0x00 0x00 0x00
+
+# CHECK: st %r0, 4095
+0x50 0x00 0x0f 0xff
+
+# CHECK: st %r0, 0(%r1)
+0x50 0x00 0x10 0x00
+
+# CHECK: st %r0, 0(%r15)
+0x50 0x00 0xf0 0x00
+
+# CHECK: st %r0, 4095(%r1,%r15)
+0x50 0x01 0xff 0xff
+
+# CHECK: st %r0, 4095(%r15,%r1)
+0x50 0x0f 0x1f 0xff
+
+# CHECK: st %r15, 0
+0x50 0xf0 0x00 0x00
+
# CHECK: stam %a0, %a0, 0
0x9b 0x00 0x00 0x00
@@ -9997,82 +11809,145 @@
# CHECK: stch %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0xc3
-# CHECK: stcy %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x72
-
# CHECK: stck 0
0xb2 0x05 0x00 0x00
# CHECK: stck 0(%r1)
0xb2 0x05 0x10 0x00
-#CHECK: stck 0(%r15)
+# CHECK: stck 0(%r15)
0xb2 0x05 0xf0 0x00
-#CHECK: stck 4095
-0xb2 0x05 0x0f 0xff
+# CHECK: stck 4095
+0xb2 0x05 0x0f 0xff
-#CHECK: stck 4095(%r1)
+# CHECK: stck 4095(%r1)
0xb2 0x05 0x1f 0xff
-#CHECK: stck 4095(%r15)
+# CHECK: stck 4095(%r15)
0xb2 0x05 0xff 0xff
+# CHECK: stcke 0
+0xb2 0x78 0x00 0x00
+
+# CHECK: stcke 0(%r1)
+0xb2 0x78 0x10 0x00
+
+# CHECK: stcke 0(%r15)
+0xb2 0x78 0xf0 0x00
+
+# CHECK: stcke 4095
+0xb2 0x78 0x0f 0xff
+
+# CHECK: stcke 4095(%r1)
+0xb2 0x78 0x1f 0xff
+
+# CHECK: stcke 4095(%r15)
+0xb2 0x78 0xff 0xff
+
# CHECK: stckf 0
0xb2 0x7c 0x00 0x00
# CHECK: stckf 0(%r1)
0xb2 0x7c 0x10 0x00
-#CHECK: stckf 0(%r15)
+# CHECK: stckf 0(%r15)
0xb2 0x7c 0xf0 0x00
-#CHECK: stckf 4095
-0xb2 0x7c 0x0f 0xff
+# CHECK: stckf 4095
+0xb2 0x7c 0x0f 0xff
-#CHECK: stckf 4095(%r1)
+# CHECK: stckf 4095(%r1)
0xb2 0x7c 0x1f 0xff
-#CHECK: stckf 4095(%r15)
+# CHECK: stckf 4095(%r15)
0xb2 0x7c 0xff 0xff
-# CHECK: stcke 0
-0xb2 0x78 0x00 0x00
+# CHECK: stcm %r0, 0, 0
+0xbe 0x00 0x00 0x00
-# CHECK: stcke 0(%r1)
-0xb2 0x78 0x10 0x00
+# CHECK: stcm %r0, 15, 4095
+0xbe 0x0f 0x0f 0xff
-#CHECK: stcke 0(%r15)
-0xb2 0x78 0xf0 0x00
+# CHECK: stcm %r0, 0, 0(%r1)
+0xbe 0x00 0x10 0x00
-#CHECK: stcke 4095
-0xb2 0x78 0x0f 0xff
+# CHECK: stcm %r0, 0, 0(%r15)
+0xbe 0x00 0xf0 0x00
-#CHECK: stcke 4095(%r1)
-0xb2 0x78 0x1f 0xff
+# CHECK: stcm %r0, 15, 4095(%r15)
+0xbe 0x0f 0xff 0xff
-#CHECK: stcke 4095(%r15)
-0xb2 0x78 0xff 0xff
+# CHECK: stcm %r0, 0, 4095(%r1)
+0xbe 0x00 0x1f 0xff
-# CHECK: stfle 0
-0xb2 0xb0 0x00 0x00
+# CHECK: stcm %r15, 0, 0
+0xbe 0xf0 0x00 0x00
-# CHECK: stfle 0(%r1)
-0xb2 0xb0 0x10 0x00
+# CHECK: stcmh %r0, 0, -524288
+0xeb 0x00 0x00 0x00 0x80 0x2c
-#CHECK: stfle 0(%r15)
-0xb2 0xb0 0xf0 0x00
+# CHECK: stcmh %r0, 0, -1
+0xeb 0x00 0x0f 0xff 0xff 0x2c
-#CHECK: stfle 4095
-0xb2 0xb0 0x0f 0xff
+# CHECK: stcmh %r0, 15, 0
+0xeb 0x0f 0x00 0x00 0x00 0x2c
-#CHECK: stfle 4095(%r1)
-0xb2 0xb0 0x1f 0xff
+# CHECK: stcmh %r0, 15, 1
+0xeb 0x0f 0x00 0x01 0x00 0x2c
-#CHECK: stfle 4095(%r15)
-0xb2 0xb0 0xff 0xff
+# CHECK: stcmh %r0, 8, 524287
+0xeb 0x08 0x0f 0xff 0x7f 0x2c
+
+# CHECK: stcmh %r0, 8, 0(%r1)
+0xeb 0x08 0x10 0x00 0x00 0x2c
+
+# CHECK: stcmh %r0, 4, 0(%r15)
+0xeb 0x04 0xf0 0x00 0x00 0x2c
+
+# CHECK: stcmh %r0, 4, 524287(%r15)
+0xeb 0x04 0xff 0xff 0x7f 0x2c
+
+# CHECK: stcmh %r0, 0, 524287(%r1)
+0xeb 0x00 0x1f 0xff 0x7f 0x2c
+
+# CHECK: stcmh %r15, 0, 0
+0xeb 0xf0 0x00 0x00 0x00 0x2c
+
+# CHECK: stcmy %r0, 0, -524288
+0xeb 0x00 0x00 0x00 0x80 0x2d
-# CHECK: stcy %r0, -1
+# CHECK: stcmy %r0, 0, -1
+0xeb 0x00 0x0f 0xff 0xff 0x2d
+
+# CHECK: stcmy %r0, 15, 0
+0xeb 0x0f 0x00 0x00 0x00 0x2d
+
+# CHECK: stcmy %r0, 15, 1
+0xeb 0x0f 0x00 0x01 0x00 0x2d
+
+# CHECK: stcmy %r0, 8, 524287
+0xeb 0x08 0x0f 0xff 0x7f 0x2d
+
+# CHECK: stcmy %r0, 8, 0(%r1)
+0xeb 0x08 0x10 0x00 0x00 0x2d
+
+# CHECK: stcmy %r0, 4, 0(%r15)
+0xeb 0x04 0xf0 0x00 0x00 0x2d
+
+# CHECK: stcmy %r0, 4, 524287(%r15)
+0xeb 0x04 0xff 0xff 0x7f 0x2d
+
+# CHECK: stcmy %r0, 0, 524287(%r1)
+0xeb 0x00 0x1f 0xff 0x7f 0x2d
+
+# CHECK: stcmy %r15, 0, 0
+0xeb 0xf0 0x00 0x00 0x00 0x2d
+
+# CHECK: stcy %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x72
+
+# CHECK: stcy %r0, -1
0xe3 0x00 0x0f 0xff 0xff 0x72
# CHECK: stcy %r0, 0
@@ -10201,6 +12076,54 @@
# CHECK: stey %f15, 0
0xed 0xf0 0x00 0x00 0x00 0x66
+# CHECK: stfh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0xcb
+
+# CHECK: stfh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0xcb
+
+# CHECK: stfh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0xcb
+
+# CHECK: stfh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0xcb
+
+# CHECK: stfh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0xcb
+
+# CHECK: stfh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0xcb
+
+# CHECK: stfh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0xcb
+
+# CHECK: stfh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0xcb
+
+# CHECK: stfh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0xcb
+
+# CHECK: stfh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0xcb
+
+# CHECK: stfle 0
+0xb2 0xb0 0x00 0x00
+
+# CHECK: stfle 0(%r1)
+0xb2 0xb0 0x10 0x00
+
+# CHECK: stfle 0(%r15)
+0xb2 0xb0 0xf0 0x00
+
+# CHECK: stfle 4095
+0xb2 0xb0 0x0f 0xff
+
+# CHECK: stfle 4095(%r1)
+0xb2 0xb0 0x1f 0xff
+
+# CHECK: stfle 4095(%r15)
+0xb2 0xb0 0xff 0xff
+
# CHECK: stfpc 0
0xb2 0x9c 0x00 0x00
@@ -10300,36 +12223,6 @@
# CHECK: sthh %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0xc7
-# CHECK: stfh %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0xcb
-
-# CHECK: stfh %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0xcb
-
-# CHECK: stfh %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0xcb
-
-# CHECK: stfh %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0xcb
-
-# CHECK: stfh %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0xcb
-
-# CHECK: stfh %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0xcb
-
-# CHECK: stfh %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0xcb
-
-# CHECK: stfh %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0xcb
-
-# CHECK: stfh %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0xcb
-
-# CHECK: stfh %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0xcb
-
# CHECK: sthy %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x70
@@ -10507,135 +12400,6 @@
# CHECK: stmy %r0, %r0, 524287(%r15)
0xeb 0x00 0xff 0xff 0x7f 0x90
-# CHECK: strag 0, 0
-0xe5 0x02 0x00 0x00 0x00 0x00
-
-# CHECK: strag 0, 4095
-0xe5 0x02 0x00 0x00 0x0f 0xff
-
-# CHECK: strag 0, 0(%r1)
-0xe5 0x02 0x00 0x00 0x10 0x00
-
-# CHECK: strag 0, 0(%r15)
-0xe5 0x02 0x00 0x00 0xf0 0x00
-
-# CHECK: strag 0(%r1), 4095(%r15)
-0xe5 0x02 0x10 0x00 0xff 0xff
-
-# CHECK: strag 4095(%r1), 0(%r15)
-0xe5 0x02 0x1f 0xff 0xf0 0x00
-
-# CHECK: strvg %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x2f
-
-# CHECK: strvg %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x2f
-
-# CHECK: strvg %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x2f
-
-# CHECK: strvg %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x2f
-
-# CHECK: strvg %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x2f
-
-# CHECK: strvg %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x2f
-
-# CHECK: strvg %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x2f
-
-# CHECK: strvg %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x2f
-
-# CHECK: strvg %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x2f
-
-# CHECK: strvg %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x2f
-
-# CHECK: strvh %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x3f
-
-# CHECK: strvh %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x3f
-
-# CHECK: strvh %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x3f
-
-# CHECK: strvh %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x3f
-
-# CHECK: strvh %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x3f
-
-# CHECK: strvh %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x3f
-
-# CHECK: strvh %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x3f
-
-# CHECK: strvh %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x3f
-
-# CHECK: strvh %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x3f
-
-# CHECK: strvh %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x3f
-
-# CHECK: strv %r0, -524288
-0xe3 0x00 0x00 0x00 0x80 0x3e
-
-# CHECK: strv %r0, -1
-0xe3 0x00 0x0f 0xff 0xff 0x3e
-
-# CHECK: strv %r0, 0
-0xe3 0x00 0x00 0x00 0x00 0x3e
-
-# CHECK: strv %r0, 1
-0xe3 0x00 0x00 0x01 0x00 0x3e
-
-# CHECK: strv %r0, 524287
-0xe3 0x00 0x0f 0xff 0x7f 0x3e
-
-# CHECK: strv %r0, 0(%r1)
-0xe3 0x00 0x10 0x00 0x00 0x3e
-
-# CHECK: strv %r0, 0(%r15)
-0xe3 0x00 0xf0 0x00 0x00 0x3e
-
-# CHECK: strv %r0, 524287(%r1,%r15)
-0xe3 0x01 0xff 0xff 0x7f 0x3e
-
-# CHECK: strv %r0, 524287(%r15,%r1)
-0xe3 0x0f 0x1f 0xff 0x7f 0x3e
-
-# CHECK: strv %r15, 0
-0xe3 0xf0 0x00 0x00 0x00 0x3e
-
-# CHECK: st %r0, 0
-0x50 0x00 0x00 0x00
-
-# CHECK: st %r0, 4095
-0x50 0x00 0x0f 0xff
-
-# CHECK: st %r0, 0(%r1)
-0x50 0x00 0x10 0x00
-
-# CHECK: st %r0, 0(%r15)
-0x50 0x00 0xf0 0x00
-
-# CHECK: st %r0, 4095(%r1,%r15)
-0x50 0x01 0xff 0xff
-
-# CHECK: st %r0, 4095(%r15,%r1)
-0x50 0x0f 0x1f 0xff
-
-# CHECK: st %r15, 0
-0x50 0xf0 0x00 0x00
-
# CHECK: stoc %r1, 2(%r3), 0
0xeb 0x10 0x30 0x02 0x00 0xf3
@@ -10762,26 +12526,113 @@
# CHECK: stpq %r14, 0
0xe3 0xe0 0x00 0x00 0x00 0x8e
-# CHECK: s %r0, 0
-0x5b 0x00 0x00 0x00
+# CHECK: strag 0, 0
+0xe5 0x02 0x00 0x00 0x00 0x00
-# CHECK: s %r0, 4095
-0x5b 0x00 0x0f 0xff
+# CHECK: strag 0, 4095
+0xe5 0x02 0x00 0x00 0x0f 0xff
-# CHECK: s %r0, 0(%r1)
-0x5b 0x00 0x10 0x00
+# CHECK: strag 0, 0(%r1)
+0xe5 0x02 0x00 0x00 0x10 0x00
-# CHECK: s %r0, 0(%r15)
-0x5b 0x00 0xf0 0x00
+# CHECK: strag 0, 0(%r15)
+0xe5 0x02 0x00 0x00 0xf0 0x00
-# CHECK: s %r0, 4095(%r1,%r15)
-0x5b 0x01 0xff 0xff
+# CHECK: strag 0(%r1), 4095(%r15)
+0xe5 0x02 0x10 0x00 0xff 0xff
-# CHECK: s %r0, 4095(%r15,%r1)
-0x5b 0x0f 0x1f 0xff
+# CHECK: strag 4095(%r1), 0(%r15)
+0xe5 0x02 0x1f 0xff 0xf0 0x00
-# CHECK: s %r15, 0
-0x5b 0xf0 0x00 0x00
+# CHECK: strv %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x3e
+
+# CHECK: strv %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x3e
+
+# CHECK: strv %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x3e
+
+# CHECK: strv %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x3e
+
+# CHECK: strv %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x3e
+
+# CHECK: strv %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x3e
+
+# CHECK: strv %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x3e
+
+# CHECK: strv %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x3e
+
+# CHECK: strv %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x3e
+
+# CHECK: strv %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x3e
+
+# CHECK: strvg %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x2f
+
+# CHECK: strvg %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x2f
+
+# CHECK: strvg %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x2f
+
+# CHECK: strvg %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x2f
+
+# CHECK: strvg %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x2f
+
+# CHECK: strvg %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x2f
+
+# CHECK: strvg %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x2f
+
+# CHECK: strvg %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x2f
+
+# CHECK: strvg %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x2f
+
+# CHECK: strvg %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x2f
+
+# CHECK: strvh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x3f
+
+# CHECK: strvh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x3f
+
+# CHECK: strvh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0x3f
+
+# CHECK: strvh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0x3f
+
+# CHECK: strvh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x3f
+
+# CHECK: strvh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x3f
+
+# CHECK: strvh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x3f
+
+# CHECK: strvh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x3f
+
+# CHECK: strvh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x3f
+
+# CHECK: strvh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x3f
# CHECK: sty %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x50
@@ -10867,9 +12718,6 @@
# CHECK: sy %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x5b
-# CHECK: tam
-0x01 0x0b
-
# CHECK: tabort 0
0xb2 0xfc 0x00 0x00
@@ -10888,6 +12736,9 @@
# CHECK: tabort 4095(%r15)
0xb2 0xfc 0xff 0xff
+# CHECK: tam
+0x01 0x0b
+
# CHECK: tbegin 0, 0
0xe5 0x60 0x00 0x00 0x00 0x00
@@ -11119,24 +12970,411 @@
# CHECK: tmy 524287(%r15), 42
0xeb 0x2a 0xff 0xff 0x7f 0x51
+# CHECK: tp 0(1)
+0xeb 0x00 0x00 0x00 0x00 0xc0
+
+# CHECK: tp 0(1,%r1)
+0xeb 0x00 0x10 0x00 0x00 0xc0
+
+# CHECK: tp 0(1,%r15)
+0xeb 0x00 0xf0 0x00 0x00 0xc0
+
+# CHECK: tp 4095(1,%r1)
+0xeb 0x00 0x1f 0xff 0x00 0xc0
+
+# CHECK: tp 4095(1,%r15)
+0xeb 0x00 0xff 0xff 0x00 0xc0
+
+# CHECK: tp 0(16,%r1)
+0xeb 0xf0 0x10 0x00 0x00 0xc0
+
+# CHECK: tp 0(16,%r15)
+0xeb 0xf0 0xf0 0x00 0x00 0xc0
+
+# CHECK: tr 0(1), 0
+0xdc 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: tr 0(1), 0(%r1)
+0xdc 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: tr 0(1), 0(%r15)
+0xdc 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: tr 0(1), 4095
+0xdc 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: tr 0(1), 4095(%r1)
+0xdc 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: tr 0(1), 4095(%r15)
+0xdc 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: tr 0(1,%r1), 0
+0xdc 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: tr 0(1,%r15), 0
+0xdc 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: tr 4095(1,%r1), 0
+0xdc 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: tr 4095(1,%r15), 0
+0xdc 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: tr 0(256,%r1), 0
+0xdc 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: tr 0(256,%r15), 0
+0xdc 0xff 0xf0 0x00 0x00 0x00
+
+# CHECK: tre %r0, %r0
+0xb2 0xa5 0x00 0x00
+
+# CHECK: tre %r0, %r15
+0xb2 0xa5 0x00 0x0f
+
+# CHECK: tre %r14, %r0
+0xb2 0xa5 0x00 0xe0
+
+# CHECK: tre %r6, %r8
+0xb2 0xa5 0x00 0x68
+
+# CHECK: troo %r0, %r0
+0xb9 0x93 0x00 0x00
+
+# CHECK: troo %r0, %r15
+0xb9 0x93 0x00 0x0f
+
+# CHECK: troo %r14, %r0
+0xb9 0x93 0x00 0xe0
+
+# CHECK: troo %r6, %r8
+0xb9 0x93 0x00 0x68
+
+# CHECK: troo %r4, %r12, 1
+0xb9 0x93 0x10 0x4c
+
+# CHECK: troo %r4, %r12, 15
+0xb9 0x93 0xf0 0x4c
+
+# CHECK: trot %r0, %r0
+0xb9 0x92 0x00 0x00
+
+# CHECK: trot %r0, %r15
+0xb9 0x92 0x00 0x0f
+
+# CHECK: trot %r14, %r0
+0xb9 0x92 0x00 0xe0
+
+# CHECK: trot %r6, %r8
+0xb9 0x92 0x00 0x68
+
+# CHECK: trot %r4, %r12, 1
+0xb9 0x92 0x10 0x4c
+
+# CHECK: trot %r4, %r12, 15
+0xb9 0x92 0xf0 0x4c
+
+# CHECK: trt 0(1), 0
+0xdd 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: trt 0(1), 0(%r1)
+0xdd 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: trt 0(1), 0(%r15)
+0xdd 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: trt 0(1), 4095
+0xdd 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: trt 0(1), 4095(%r1)
+0xdd 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: trt 0(1), 4095(%r15)
+0xdd 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: trt 0(1,%r1), 0
+0xdd 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: trt 0(1,%r15), 0
+0xdd 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: trt 4095(1,%r1), 0
+0xdd 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: trt 4095(1,%r15), 0
+0xdd 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: trt 0(256,%r1), 0
+0xdd 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: trt 0(256,%r15), 0
+0xdd 0xff 0xf0 0x00 0x00 0x00
+
+# CHECK: trte %r0, %r0
+0xb9 0xbf 0x00 0x00
+
+# CHECK: trte %r0, %r15
+0xb9 0xbf 0x00 0x0f
+
+# CHECK: trte %r14, %r0
+0xb9 0xbf 0x00 0xe0
+
+# CHECK: trte %r6, %r8
+0xb9 0xbf 0x00 0x68
+
+# CHECK: trte %r4, %r12, 1
+0xb9 0xbf 0x10 0x4c
+
+# CHECK: trte %r4, %r12, 15
+0xb9 0xbf 0xf0 0x4c
+
+# CHECK: trto %r0, %r0
+0xb9 0x91 0x00 0x00
+
+# CHECK: trto %r0, %r15
+0xb9 0x91 0x00 0x0f
+
+# CHECK: trto %r14, %r0
+0xb9 0x91 0x00 0xe0
+
+# CHECK: trto %r6, %r8
+0xb9 0x91 0x00 0x68
+
+# CHECK: trto %r4, %r12, 1
+0xb9 0x91 0x10 0x4c
+
+# CHECK: trto %r4, %r12, 15
+0xb9 0x91 0xf0 0x4c
+
+# CHECK: trtr 0(1), 0
+0xd0 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: trtr 0(1), 0(%r1)
+0xd0 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: trtr 0(1), 0(%r15)
+0xd0 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: trtr 0(1), 4095
+0xd0 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: trtr 0(1), 4095(%r1)
+0xd0 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: trtr 0(1), 4095(%r15)
+0xd0 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: trtr 0(1,%r1), 0
+0xd0 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: trtr 0(1,%r15), 0
+0xd0 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: trtr 4095(1,%r1), 0
+0xd0 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: trtr 4095(1,%r15), 0
+0xd0 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: trtr 0(256,%r1), 0
+0xd0 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: trtr 0(256,%r15), 0
+0xd0 0xff 0xf0 0x00 0x00 0x00
+
+# CHECK: trtre %r0, %r0
+0xb9 0xbd 0x00 0x00
+
+# CHECK: trtre %r0, %r15
+0xb9 0xbd 0x00 0x0f
+
+# CHECK: trtre %r14, %r0
+0xb9 0xbd 0x00 0xe0
+
+# CHECK: trtre %r6, %r8
+0xb9 0xbd 0x00 0x68
+
+# CHECK: trtre %r4, %r12, 1
+0xb9 0xbd 0x10 0x4c
+
+# CHECK: trtre %r4, %r12, 15
+0xb9 0xbd 0xf0 0x4c
+
+# CHECK: trtt %r0, %r0
+0xb9 0x90 0x00 0x00
+
+# CHECK: trtt %r0, %r15
+0xb9 0x90 0x00 0x0f
+
+# CHECK: trtt %r14, %r0
+0xb9 0x90 0x00 0xe0
+
+# CHECK: trtt %r6, %r8
+0xb9 0x90 0x00 0x68
+
+# CHECK: trtt %r4, %r12, 1
+0xb9 0x90 0x10 0x4c
+
+# CHECK: trtt %r4, %r12, 15
+0xb9 0x90 0xf0 0x4c
+
# CHECK: ts 0
0x93 0x00 0x00 0x00
# CHECK: ts 0(%r1)
0x93 0x00 0x10 0x00
-#CHECK: ts 0(%r15)
+# CHECK: ts 0(%r15)
0x93 0x00 0xf0 0x00
-#CHECK: ts 4095
+# CHECK: ts 4095
0x93 0x00 0x0f 0xff
-#CHECK: ts 4095(%r1)
+# CHECK: ts 4095(%r1)
0x93 0x00 0x1f 0xff
-#CHECK: ts 4095(%r15)
+# CHECK: ts 4095(%r15)
0x93 0x00 0xff 0xff
+# CHECK: unpk 0(1), 0(1)
+0xf3 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: unpk 0(1), 0(1,%r1)
+0xf3 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: unpk 0(1), 0(1,%r15)
+0xf3 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: unpk 0(1), 4095(1)
+0xf3 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: unpk 0(1), 4095(1,%r1)
+0xf3 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: unpk 0(1), 4095(1,%r15)
+0xf3 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: unpk 0(1,%r1), 0(1)
+0xf3 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: unpk 0(1,%r15), 0(1)
+0xf3 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: unpk 4095(1,%r1), 0(1)
+0xf3 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: unpk 4095(1,%r15), 0(1)
+0xf3 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: unpk 0(16,%r1), 0(1)
+0xf3 0xf0 0x10 0x00 0x00 0x00
+
+# CHECK: unpk 0(16,%r15), 0(1)
+0xf3 0xf0 0xf0 0x00 0x00 0x00
+
+# CHECK: unpk 0(1), 0(16,%r1)
+0xf3 0x0f 0x00 0x00 0x10 0x00
+
+# CHECK: unpk 0(1), 0(16,%r15)
+0xf3 0x0f 0x00 0x00 0xf0 0x00
+
+# CHECK: unpka 0(1), 0
+0xea 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: unpka 0(1), 0(%r1)
+0xea 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: unpka 0(1), 0(%r15)
+0xea 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: unpka 0(1), 4095
+0xea 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: unpka 0(1), 4095(%r1)
+0xea 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: unpka 0(1), 4095(%r15)
+0xea 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: unpka 0(1,%r1), 0
+0xea 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: unpka 0(1,%r15), 0
+0xea 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: unpka 4095(1,%r1), 0
+0xea 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: unpka 4095(1,%r15), 0
+0xea 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: unpka 0(256,%r1), 0
+0xea 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: unpka 0(256,%r15), 0
+0xea 0xff 0xf0 0x00 0x00 0x00
+
+# CHECK: unpku 0(1), 0
+0xe2 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: unpku 0(1), 0(%r1)
+0xe2 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: unpku 0(1), 0(%r15)
+0xe2 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: unpku 0(1), 4095
+0xe2 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: unpku 0(1), 4095(%r1)
+0xe2 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: unpku 0(1), 4095(%r15)
+0xe2 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: unpku 0(1,%r1), 0
+0xe2 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: unpku 0(1,%r15), 0
+0xe2 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: unpku 4095(1,%r1), 0
+0xe2 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: unpku 4095(1,%r15), 0
+0xe2 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: unpku 0(256,%r1), 0
+0xe2 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: unpku 0(256,%r15), 0
+0xe2 0xff 0xf0 0x00 0x00 0x00
+
+# CHECK: upt
+0x01 0x02
+
+# CHECK: x %r0, 0
+0x57 0x00 0x00 0x00
+
+# CHECK: x %r0, 4095
+0x57 0x00 0x0f 0xff
+
+# CHECK: x %r0, 0(%r1)
+0x57 0x00 0x10 0x00
+
+# CHECK: x %r0, 0(%r15)
+0x57 0x00 0xf0 0x00
+
+# CHECK: x %r0, 4095(%r1,%r15)
+0x57 0x01 0xff 0xff
+
+# CHECK: x %r0, 4095(%r15,%r1)
+0x57 0x0f 0x1f 0xff
+
+# CHECK: x %r15, 0
+0x57 0xf0 0x00 0x00
+
# CHECK: xc 0(1), 0
0xd7 0x00 0x00 0x00 0x00 0x00
@@ -11173,24 +13411,6 @@
# CHECK: xc 0(256,%r15), 0
0xd7 0xff 0xf0 0x00 0x00 0x00
-# CHECK: xgr %r0, %r0
-0xb9 0x82 0x00 0x00
-
-# CHECK: xgr %r0, %r15
-0xb9 0x82 0x00 0x0f
-
-# CHECK: xgr %r15, %r0
-0xb9 0x82 0x00 0xf0
-
-# CHECK: xgr %r7, %r8
-0xb9 0x82 0x00 0x78
-
-# CHECK: xgrk %r0, %r0, %r0
-0xb9 0xe7 0x00 0x00
-
-# CHECK: xgrk %r2, %r3, %r4
-0xb9 0xe7 0x40 0x23
-
# CHECK: xg %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x82
@@ -11221,23 +13441,23 @@
# CHECK: xg %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x82
-# CHECK: xihf %r0, 0
-0xc0 0x06 0x00 0x00 0x00 0x00
+# CHECK: xgr %r0, %r0
+0xb9 0x82 0x00 0x00
-# CHECK: xihf %r0, 4294967295
-0xc0 0x06 0xff 0xff 0xff 0xff
+# CHECK: xgr %r0, %r15
+0xb9 0x82 0x00 0x0f
-# CHECK: xihf %r15, 0
-0xc0 0xf6 0x00 0x00 0x00 0x00
+# CHECK: xgr %r15, %r0
+0xb9 0x82 0x00 0xf0
-# CHECK: xilf %r0, 0
-0xc0 0x07 0x00 0x00 0x00 0x00
+# CHECK: xgr %r7, %r8
+0xb9 0x82 0x00 0x78
-# CHECK: xilf %r0, 4294967295
-0xc0 0x07 0xff 0xff 0xff 0xff
+# CHECK: xgrk %r0, %r0, %r0
+0xb9 0xe7 0x00 0x00
-# CHECK: xilf %r15, 0
-0xc0 0xf7 0x00 0x00 0x00 0x00
+# CHECK: xgrk %r2, %r3, %r4
+0xb9 0xe7 0x40 0x23
# CHECK: xi 0, 0
0x97 0x00 0x00 0x00
@@ -11260,6 +13480,24 @@
# CHECK: xi 4095(%r15), 42
0x97 0x2a 0xff 0xff
+# CHECK: xihf %r0, 0
+0xc0 0x06 0x00 0x00 0x00 0x00
+
+# CHECK: xihf %r0, 4294967295
+0xc0 0x06 0xff 0xff 0xff 0xff
+
+# CHECK: xihf %r15, 0
+0xc0 0xf6 0x00 0x00 0x00 0x00
+
+# CHECK: xilf %r0, 0
+0xc0 0x07 0x00 0x00 0x00 0x00
+
+# CHECK: xilf %r0, 4294967295
+0xc0 0x07 0xff 0xff 0xff 0xff
+
+# CHECK: xilf %r15, 0
+0xc0 0xf7 0x00 0x00 0x00 0x00
+
# CHECK: xiy -524288, 0
0xeb 0x00 0x00 0x00 0x80 0x57
@@ -11308,27 +13546,6 @@
# CHECK: xrk %r2, %r3, %r4
0xb9 0xf7 0x40 0x23
-# CHECK: x %r0, 0
-0x57 0x00 0x00 0x00
-
-# CHECK: x %r0, 4095
-0x57 0x00 0x0f 0xff
-
-# CHECK: x %r0, 0(%r1)
-0x57 0x00 0x10 0x00
-
-# CHECK: x %r0, 0(%r15)
-0x57 0x00 0xf0 0x00
-
-# CHECK: x %r0, 4095(%r1,%r15)
-0x57 0x01 0xff 0xff
-
-# CHECK: x %r0, 4095(%r15,%r1)
-0x57 0x0f 0x1f 0xff
-
-# CHECK: x %r15, 0
-0x57 0xf0 0x00 0x00
-
# CHECK: xy %r0, -524288
0xe3 0x00 0x00 0x00 0x80 0x57
@@ -11358,3 +13575,45 @@
# CHECK: xy %r15, 0
0xe3 0xf0 0x00 0x00 0x00 0x57
+
+# CHECK: zap 0(1), 0(1)
+0xf8 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: zap 0(1), 0(1,%r1)
+0xf8 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: zap 0(1), 0(1,%r15)
+0xf8 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: zap 0(1), 4095(1)
+0xf8 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: zap 0(1), 4095(1,%r1)
+0xf8 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: zap 0(1), 4095(1,%r15)
+0xf8 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: zap 0(1,%r1), 0(1)
+0xf8 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: zap 0(1,%r15), 0(1)
+0xf8 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: zap 4095(1,%r1), 0(1)
+0xf8 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: zap 4095(1,%r15), 0(1)
+0xf8 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: zap 0(16,%r1), 0(1)
+0xf8 0xf0 0x10 0x00 0x00 0x00
+
+# CHECK: zap 0(16,%r15), 0(1)
+0xf8 0xf0 0xf0 0x00 0x00 0x00
+
+# CHECK: zap 0(1), 0(16,%r1)
+0xf8 0x0f 0x00 0x00 0x10 0x00
+
+# CHECK: zap 0(1), 0(16,%r15)
+0xf8 0x0f 0x00 0x00 0xf0 0x00
diff --git a/test/MC/SystemZ/insn-bad-z13.s b/test/MC/SystemZ/insn-bad-z13.s
index db2de118bf36..82f47feeb8a9 100644
--- a/test/MC/SystemZ/insn-bad-z13.s
+++ b/test/MC/SystemZ/insn-bad-z13.s
@@ -5,6 +5,89 @@
# RUN: FileCheck < %t %s
#CHECK: error: invalid operand
+#CHECK: lcbb %r0, 0, -1
+#CHECK: error: invalid operand
+#CHECK: lcbb %r0, 0, 16
+#CHECK: error: invalid operand
+#CHECK: lcbb %r0, -1, 0
+#CHECK: error: invalid operand
+#CHECK: lcbb %r0, 4096, 0
+#CHECK: error: invalid use of vector addressing
+#CHECK: lcbb %r0, 0(%v1,%r2), 0
+
+ lcbb %r0, 0, -1
+ lcbb %r0, 0, 16
+ lcbb %r0, -1, 0
+ lcbb %r0, 4096, 0
+ lcbb %r0, 0(%v1,%r2), 0
+
+#CHECK: error: invalid operand
+#CHECK: llzrgf %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: llzrgf %r0, 524288
+
+ llzrgf %r0, -524289
+ llzrgf %r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: locfh %r0, 0, -1
+#CHECK: error: invalid operand
+#CHECK: locfh %r0, 0, 16
+#CHECK: error: invalid operand
+#CHECK: locfh %r0, -524289, 1
+#CHECK: error: invalid operand
+#CHECK: locfh %r0, 524288, 1
+#CHECK: error: invalid use of indexed addressing
+#CHECK: locfh %r0, 0(%r1,%r2), 1
+
+ locfh %r0, 0, -1
+ locfh %r0, 0, 16
+ locfh %r0, -524289, 1
+ locfh %r0, 524288, 1
+ locfh %r0, 0(%r1,%r2), 1
+
+#CHECK: error: invalid operand
+#CHECK: locfhr %r0, %r0, -1
+#CHECK: error: invalid operand
+#CHECK: locfhr %r0, %r0, 16
+
+ locfhr %r0, %r0, -1
+ locfhr %r0, %r0, 16
+
+#CHECK: error: invalid operand
+#CHECK: locghie %r0, 66000
+#CHECK: error: invalid operand
+#CHECK: locghie %f0, 0
+#CHECK: error: invalid operand
+#CHECK: locghie 0, %r0
+
+ locghie %r0, 66000
+ locghie %f0, 0
+ locghie 0, %r0
+
+#CHECK: error: invalid operand
+#CHECK: lochhie %r0, 66000
+#CHECK: error: invalid operand
+#CHECK: lochhie %f0, 0
+#CHECK: error: invalid operand
+#CHECK: lochhie 0, %r0
+
+ lochhie %r0, 66000
+ lochhie %f0, 0
+ lochhie 0, %r0
+
+#CHECK: error: invalid operand
+#CHECK: lochie %r0, 66000
+#CHECK: error: invalid operand
+#CHECK: lochie %f0, 0
+#CHECK: error: invalid operand
+#CHECK: lochie 0, %r0
+
+ lochie %r0, 66000
+ lochie %f0, 0
+ lochie 0, %r0
+
+#CHECK: error: invalid operand
#CHECK: lzrf %r0, -524289
#CHECK: error: invalid operand
#CHECK: lzrf %r0, 524288
@@ -20,30 +103,30 @@
lzrg %r0, -524289
lzrg %r0, 524288
-#CHECK: error: invalid operand
-#CHECK: llzrgf %r0, -524289
-#CHECK: error: invalid operand
-#CHECK: llzrgf %r0, 524288
+#CHECK: error: invalid register pair
+#CHECK: ppno %r1, %r2
+#CHECK: error: invalid register pair
+#CHECK: ppno %r2, %r1
- llzrgf %r0, -524289
- llzrgf %r0, 524288
+ ppno %r1, %r2
+ ppno %r2, %r1
#CHECK: error: invalid operand
-#CHECK: lcbb %r0, 0, -1
+#CHECK: stocfh %r0, 0, -1
#CHECK: error: invalid operand
-#CHECK: lcbb %r0, 0, 16
+#CHECK: stocfh %r0, 0, 16
#CHECK: error: invalid operand
-#CHECK: lcbb %r0, -1, 0
+#CHECK: stocfh %r0, -524289, 1
#CHECK: error: invalid operand
-#CHECK: lcbb %r0, 4096, 0
-#CHECK: error: invalid use of vector addressing
-#CHECK: lcbb %r0, 0(%v1,%r2), 0
+#CHECK: stocfh %r0, 524288, 1
+#CHECK: error: invalid use of indexed addressing
+#CHECK: stocfh %r0, 0(%r1,%r2), 1
- lcbb %r0, 0, -1
- lcbb %r0, 0, 16
- lcbb %r0, -1, 0
- lcbb %r0, 4096, 0
- lcbb %r0, 0(%v1,%r2), 0
+ stocfh %r0, 0, -1
+ stocfh %r0, 0, 16
+ stocfh %r0, -524289, 1
+ stocfh %r0, 524288, 1
+ stocfh %r0, 0(%r1,%r2), 1
#CHECK: error: invalid operand
#CHECK: vcdg %v0, %v0, 0, 0, -1
@@ -474,6 +557,20 @@
vfaef %v0, %v0, %v0, 0, 0
#CHECK: error: invalid operand
+#CHECK: vfaefs %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vfaefs %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vfaefs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfaefs %v0, %v0, %v0, 0, 0
+
+ vfaefs %v0, %v0, %v0, -1
+ vfaefs %v0, %v0, %v0, 16
+ vfaefs %v0, %v0
+ vfaefs %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
#CHECK: vfaeh %v0, %v0, %v0, -1
#CHECK: error: invalid operand
#CHECK: vfaeh %v0, %v0, %v0, 16
@@ -488,18 +585,60 @@
vfaeh %v0, %v0, %v0, 0, 0
#CHECK: error: invalid operand
-#CHECK: vfaezh %v0, %v0, %v0, -1
+#CHECK: vfaehs %v0, %v0, %v0, -1
#CHECK: error: invalid operand
-#CHECK: vfaezh %v0, %v0, %v0, 16
+#CHECK: vfaehs %v0, %v0, %v0, 16
#CHECK: error: too few operands
-#CHECK: vfaezh %v0, %v0
+#CHECK: vfaehs %v0, %v0
#CHECK: error: invalid operand
-#CHECK: vfaezh %v0, %v0, %v0, 0, 0
+#CHECK: vfaehs %v0, %v0, %v0, 0, 0
- vfaezh %v0, %v0, %v0, -1
- vfaezh %v0, %v0, %v0, 16
- vfaezh %v0, %v0
- vfaezh %v0, %v0, %v0, 0, 0
+ vfaehs %v0, %v0, %v0, -1
+ vfaehs %v0, %v0, %v0, 16
+ vfaehs %v0, %v0
+ vfaehs %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vfaezb %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vfaezb %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vfaezb %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfaezb %v0, %v0, %v0, 0, 0
+
+ vfaezb %v0, %v0, %v0, -1
+ vfaezb %v0, %v0, %v0, 16
+ vfaezb %v0, %v0
+ vfaezb %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vfaezbs %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vfaezbs %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vfaezbs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfaezbs %v0, %v0, %v0, 0, 0
+
+ vfaezbs %v0, %v0, %v0, -1
+ vfaezbs %v0, %v0, %v0, 16
+ vfaezbs %v0, %v0
+ vfaezbs %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vfaezf %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vfaezf %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vfaezf %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfaezf %v0, %v0, %v0, 0, 0
+
+ vfaezf %v0, %v0, %v0, -1
+ vfaezf %v0, %v0, %v0, 16
+ vfaezf %v0, %v0
+ vfaezf %v0, %v0, %v0, 0, 0
#CHECK: error: invalid operand
#CHECK: vfaezfs %v0, %v0, %v0, -1
@@ -516,6 +655,34 @@
vfaezfs %v0, %v0, %v0, 0, 0
#CHECK: error: invalid operand
+#CHECK: vfaezh %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vfaezh %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vfaezh %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfaezh %v0, %v0, %v0, 0, 0
+
+ vfaezh %v0, %v0, %v0, -1
+ vfaezh %v0, %v0, %v0, 16
+ vfaezh %v0, %v0
+ vfaezh %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vfaezhs %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vfaezhs %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vfaezhs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfaezhs %v0, %v0, %v0, 0, 0
+
+ vfaezhs %v0, %v0, %v0, -1
+ vfaezhs %v0, %v0, %v0, 16
+ vfaezhs %v0, %v0
+ vfaezhs %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
#CHECK: vfee %v0, %v0, %v0, 0, -1
#CHECK: error: invalid operand
#CHECK: vfee %v0, %v0, %v0, 0, 16
@@ -549,6 +716,14 @@
vfeeb %v0, %v0
vfeeb %v0, %v0, %v0, 0, 0
+#CHECK: error: too few operands
+#CHECK: vfeebs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfeebs %v0, %v0, %v0, 0
+
+ vfeebs %v0, %v0
+ vfeebs %v0, %v0, %v0, 0
+
#CHECK: error: invalid operand
#CHECK: vfeef %v0, %v0, %v0, -1
#CHECK: error: invalid operand
@@ -563,6 +738,14 @@
vfeef %v0, %v0
vfeef %v0, %v0, %v0, 0, 0
+#CHECK: error: too few operands
+#CHECK: vfeefs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfeefs %v0, %v0, %v0, 0
+
+ vfeefs %v0, %v0
+ vfeefs %v0, %v0, %v0, 0
+
#CHECK: error: invalid operand
#CHECK: vfeeh %v0, %v0, %v0, -1
#CHECK: error: invalid operand
@@ -578,22 +761,6 @@
vfeeh %v0, %v0, %v0, 0, 0
#CHECK: error: too few operands
-#CHECK: vfeebs %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfeebs %v0, %v0, %v0, 0
-
- vfeebs %v0, %v0
- vfeebs %v0, %v0, %v0, 0
-
-#CHECK: error: too few operands
-#CHECK: vfeefs %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfeefs %v0, %v0, %v0, 0
-
- vfeefs %v0, %v0
- vfeefs %v0, %v0, %v0, 0
-
-#CHECK: error: too few operands
#CHECK: vfeehs %v0, %v0
#CHECK: error: invalid operand
#CHECK: vfeehs %v0, %v0, %v0, 0
@@ -610,6 +777,14 @@
vfeezb %v0, %v0, %v0, 0
#CHECK: error: too few operands
+#CHECK: vfeezbs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfeezbs %v0, %v0, %v0, 0
+
+ vfeezbs %v0, %v0
+ vfeezbs %v0, %v0, %v0, 0
+
+#CHECK: error: too few operands
#CHECK: vfeezf %v0, %v0
#CHECK: error: invalid operand
#CHECK: vfeezf %v0, %v0, %v0, 0
@@ -618,6 +793,14 @@
vfeezf %v0, %v0, %v0, 0
#CHECK: error: too few operands
+#CHECK: vfeezfs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfeezfs %v0, %v0, %v0, 0
+
+ vfeezfs %v0, %v0
+ vfeezfs %v0, %v0, %v0, 0
+
+#CHECK: error: too few operands
#CHECK: vfeezh %v0, %v0
#CHECK: error: invalid operand
#CHECK: vfeezh %v0, %v0, %v0, 0
@@ -626,28 +809,146 @@
vfeezh %v0, %v0, %v0, 0
#CHECK: error: too few operands
-#CHECK: vfeezbs %v0, %v0
+#CHECK: vfeezhs %v0, %v0
#CHECK: error: invalid operand
-#CHECK: vfeezbs %v0, %v0, %v0, 0
+#CHECK: vfeezhs %v0, %v0, %v0, 0
- vfeezbs %v0, %v0
- vfeezbs %v0, %v0, %v0, 0
+ vfeezhs %v0, %v0
+ vfeezhs %v0, %v0, %v0, 0
+#CHECK: error: invalid operand
+#CHECK: vfene %v0, %v0, %v0, 0, -1
+#CHECK: error: invalid operand
+#CHECK: vfene %v0, %v0, %v0, 0, 16
+#CHECK: error: invalid operand
+#CHECK: vfene %v0, %v0, %v0, -1, 0
+#CHECK: error: invalid operand
+#CHECK: vfene %v0, %v0, %v0, 16, 0
#CHECK: error: too few operands
-#CHECK: vfeezfs %v0, %v0
+#CHECK: vfene %v0, %v0, %v0
#CHECK: error: invalid operand
-#CHECK: vfeezfs %v0, %v0, %v0, 0
+#CHECK: vfene %v0, %v0, %v0, 0, 0, 0
- vfeezfs %v0, %v0
- vfeezfs %v0, %v0, %v0, 0
+ vfene %v0, %v0, %v0, 0, -1
+ vfene %v0, %v0, %v0, 0, 16
+ vfene %v0, %v0, %v0, -1, 0
+ vfene %v0, %v0, %v0, 16, 0
+ vfene %v0, %v0, %v0
+ vfene %v0, %v0, %v0, 0, 0, 0
+#CHECK: error: invalid operand
+#CHECK: vfeneb %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vfeneb %v0, %v0, %v0, 16
#CHECK: error: too few operands
-#CHECK: vfeezhs %v0, %v0
+#CHECK: vfeneb %v0, %v0
#CHECK: error: invalid operand
-#CHECK: vfeezhs %v0, %v0, %v0, 0
+#CHECK: vfeneb %v0, %v0, %v0, 0, 0
- vfeezhs %v0, %v0
- vfeezhs %v0, %v0, %v0, 0
+ vfeneb %v0, %v0, %v0, -1
+ vfeneb %v0, %v0, %v0, 16
+ vfeneb %v0, %v0
+ vfeneb %v0, %v0, %v0, 0, 0
+
+#CHECK: error: too few operands
+#CHECK: vfenebs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfenebs %v0, %v0, %v0, 0
+
+ vfenebs %v0, %v0
+ vfenebs %v0, %v0, %v0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vfenef %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vfenef %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vfenef %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfenef %v0, %v0, %v0, 0, 0
+
+ vfenef %v0, %v0, %v0, -1
+ vfenef %v0, %v0, %v0, 16
+ vfenef %v0, %v0
+ vfenef %v0, %v0, %v0, 0, 0
+
+#CHECK: error: too few operands
+#CHECK: vfenefs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfenefs %v0, %v0, %v0, 0
+
+ vfenefs %v0, %v0
+ vfenefs %v0, %v0, %v0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vfeneh %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vfeneh %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vfeneh %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfeneh %v0, %v0, %v0, 0, 0
+
+ vfeneh %v0, %v0, %v0, -1
+ vfeneh %v0, %v0, %v0, 16
+ vfeneh %v0, %v0
+ vfeneh %v0, %v0, %v0, 0, 0
+
+#CHECK: error: too few operands
+#CHECK: vfenehs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfenehs %v0, %v0, %v0, 0
+
+ vfenehs %v0, %v0
+ vfenehs %v0, %v0, %v0, 0
+
+#CHECK: error: too few operands
+#CHECK: vfenezb %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfenezb %v0, %v0, %v0, 0
+
+ vfenezb %v0, %v0
+ vfenezb %v0, %v0, %v0, 0
+
+#CHECK: error: too few operands
+#CHECK: vfenezbs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfenezbs %v0, %v0, %v0, 0
+
+ vfenezbs %v0, %v0
+ vfenezbs %v0, %v0, %v0, 0
+
+#CHECK: error: too few operands
+#CHECK: vfenezf %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfenezf %v0, %v0, %v0, 0
+
+ vfenezf %v0, %v0
+ vfenezf %v0, %v0, %v0, 0
+
+#CHECK: error: too few operands
+#CHECK: vfenezfs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfenezfs %v0, %v0, %v0, 0
+
+ vfenezfs %v0, %v0
+ vfenezfs %v0, %v0, %v0, 0
+
+#CHECK: error: too few operands
+#CHECK: vfenezh %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfenezh %v0, %v0, %v0, 0
+
+ vfenezh %v0, %v0
+ vfenezh %v0, %v0, %v0, 0
+
+#CHECK: error: too few operands
+#CHECK: vfenezhs %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vfenezhs %v0, %v0, %v0, 0
+
+ vfenezhs %v0, %v0
+ vfenezhs %v0, %v0, %v0, 0
#CHECK: error: invalid operand
#CHECK: vfi %v0, %v0, 0, 0, -1
@@ -869,6 +1170,14 @@
vistrb %v0
vistrb %v0, %v0, 0, 0
+#CHECK: error: too few operands
+#CHECK: vistrbs %v0
+#CHECK: error: invalid operand
+#CHECK: vistrbs %v0, %v0, 0
+
+ vistrbs %v0
+ vistrbs %v0, %v0, 0
+
#CHECK: error: invalid operand
#CHECK: vistrf %v0, %v0, -1
#CHECK: error: invalid operand
@@ -883,6 +1192,14 @@
vistrf %v0
vistrf %v0, %v0, 0, 0
+#CHECK: error: too few operands
+#CHECK: vistrfs %v0
+#CHECK: error: invalid operand
+#CHECK: vistrfs %v0, %v0, 0
+
+ vistrfs %v0
+ vistrfs %v0, %v0, 0
+
#CHECK: error: invalid operand
#CHECK: vistrh %v0, %v0, -1
#CHECK: error: invalid operand
@@ -898,22 +1215,6 @@
vistrh %v0, %v0, 0, 0
#CHECK: error: too few operands
-#CHECK: vistrbs %v0
-#CHECK: error: invalid operand
-#CHECK: vistrbs %v0, %v0, 0
-
- vistrbs %v0
- vistrbs %v0, %v0, 0
-
-#CHECK: error: too few operands
-#CHECK: vistrfs %v0
-#CHECK: error: invalid operand
-#CHECK: vistrfs %v0, %v0, 0
-
- vistrfs %v0
- vistrfs %v0, %v0, 0
-
-#CHECK: error: too few operands
#CHECK: vistrhs %v0
#CHECK: error: invalid operand
#CHECK: vistrhs %v0, %v0, 0
@@ -1371,132 +1672,6 @@
vlvgh %v0, %r0, 0(%r0)
#CHECK: error: invalid operand
-#CHECK: vfene %v0, %v0, %v0, 0, -1
-#CHECK: error: invalid operand
-#CHECK: vfene %v0, %v0, %v0, 0, 16
-#CHECK: error: invalid operand
-#CHECK: vfene %v0, %v0, %v0, -1, 0
-#CHECK: error: invalid operand
-#CHECK: vfene %v0, %v0, %v0, 16, 0
-#CHECK: error: too few operands
-#CHECK: vfene %v0, %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfene %v0, %v0, %v0, 0, 0, 0
-
- vfene %v0, %v0, %v0, 0, -1
- vfene %v0, %v0, %v0, 0, 16
- vfene %v0, %v0, %v0, -1, 0
- vfene %v0, %v0, %v0, 16, 0
- vfene %v0, %v0, %v0
- vfene %v0, %v0, %v0, 0, 0, 0
-
-#CHECK: error: invalid operand
-#CHECK: vfeneb %v0, %v0, %v0, -1
-#CHECK: error: invalid operand
-#CHECK: vfeneb %v0, %v0, %v0, 16
-#CHECK: error: too few operands
-#CHECK: vfeneb %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfeneb %v0, %v0, %v0, 0, 0
-
- vfeneb %v0, %v0, %v0, -1
- vfeneb %v0, %v0, %v0, 16
- vfeneb %v0, %v0
- vfeneb %v0, %v0, %v0, 0, 0
-
-#CHECK: error: invalid operand
-#CHECK: vfenef %v0, %v0, %v0, -1
-#CHECK: error: invalid operand
-#CHECK: vfenef %v0, %v0, %v0, 16
-#CHECK: error: too few operands
-#CHECK: vfenef %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfenef %v0, %v0, %v0, 0, 0
-
- vfenef %v0, %v0, %v0, -1
- vfenef %v0, %v0, %v0, 16
- vfenef %v0, %v0
- vfenef %v0, %v0, %v0, 0, 0
-
-#CHECK: error: invalid operand
-#CHECK: vfeneh %v0, %v0, %v0, -1
-#CHECK: error: invalid operand
-#CHECK: vfeneh %v0, %v0, %v0, 16
-#CHECK: error: too few operands
-#CHECK: vfeneh %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfeneh %v0, %v0, %v0, 0, 0
-
- vfeneh %v0, %v0, %v0, -1
- vfeneh %v0, %v0, %v0, 16
- vfeneh %v0, %v0
- vfeneh %v0, %v0, %v0, 0, 0
-
-#CHECK: error: too few operands
-#CHECK: vfenebs %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfenebs %v0, %v0, %v0, 0
-
- vfenebs %v0, %v0
- vfenebs %v0, %v0, %v0, 0
-
-#CHECK: error: too few operands
-#CHECK: vfenefs %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfenefs %v0, %v0, %v0, 0
-
- vfenefs %v0, %v0
- vfenefs %v0, %v0, %v0, 0
-
-#CHECK: error: too few operands
-#CHECK: vfenehs %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfenehs %v0, %v0, %v0, 0
-
- vfenehs %v0, %v0
- vfenehs %v0, %v0, %v0, 0
-
-#CHECK: error: too few operands
-#CHECK: vfenezb %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfenezb %v0, %v0, %v0, 0
-
- vfenezb %v0, %v0
- vfenezb %v0, %v0, %v0, 0
-
-#CHECK: error: too few operands
-#CHECK: vfenezf %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfenezf %v0, %v0, %v0, 0
-
- vfenezf %v0, %v0
- vfenezf %v0, %v0, %v0, 0
-
-#CHECK: error: too few operands
-#CHECK: vfenezh %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfenezh %v0, %v0, %v0, 0
-
- vfenezh %v0, %v0
- vfenezh %v0, %v0, %v0, 0
-
-#CHECK: error: too few operands
-#CHECK: vfenezbs %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfenezbs %v0, %v0, %v0, 0
-
- vfenezbs %v0, %v0
- vfenezbs %v0, %v0, %v0, 0
-
-#CHECK: error: too few operands
-#CHECK: vfenezfs %v0, %v0
-#CHECK: error: invalid operand
-#CHECK: vfenezfs %v0, %v0, %v0, 0
-
- vfenezfs %v0, %v0
- vfenezfs %v0, %v0, %v0, 0
-
-#CHECK: error: invalid operand
#CHECK: vpdi %v0, %v0, %v0, -1
#CHECK: error: invalid operand
#CHECK: vpdi %v0, %v0, %v0, 16
@@ -1755,12 +1930,12 @@
#CHECK: error: invalid operand
#CHECK: vstrc %v0, %v0, %v0, %v0, 0, 0, 0
- vstrc %v0, %v0, %v0, %v0, 0, -1
- vstrc %v0, %v0, %v0, %v0, 0, 16
- vstrc %v0, %v0, %v0, %v0, -1, 0
- vstrc %v0, %v0, %v0, %v0, 16, 0
- vstrc %v0, %v0, %v0, %v0
- vstrc %v0, %v0, %v0, %v0, 0, 0, 0
+ vstrc %v0, %v0, %v0, %v0, 0, -1
+ vstrc %v0, %v0, %v0, %v0, 0, 16
+ vstrc %v0, %v0, %v0, %v0, -1, 0
+ vstrc %v0, %v0, %v0, %v0, 16, 0
+ vstrc %v0, %v0, %v0, %v0
+ vstrc %v0, %v0, %v0, %v0, 0, 0, 0
#CHECK: error: invalid operand
#CHECK: vstrcb %v0, %v0, %v0, %v0, -1
@@ -1771,10 +1946,10 @@
#CHECK: error: invalid operand
#CHECK: vstrcb %v0, %v0, %v0, %v0, 0, 0
- vstrcb %v0, %v0, %v0, %v0, -1
- vstrcb %v0, %v0, %v0, %v0, 16
- vstrcb %v0, %v0, %v0
- vstrcb %v0, %v0, %v0, %v0, 0, 0
+ vstrcb %v0, %v0, %v0, %v0, -1
+ vstrcb %v0, %v0, %v0, %v0, 16
+ vstrcb %v0, %v0, %v0
+ vstrcb %v0, %v0, %v0, %v0, 0, 0
#CHECK: error: invalid operand
#CHECK: vstrcbs %v0, %v0, %v0, %v0, -1
@@ -1785,10 +1960,10 @@
#CHECK: error: invalid operand
#CHECK: vstrcbs %v0, %v0, %v0, %v0, 0, 0
- vstrcbs %v0, %v0, %v0, %v0, -1
- vstrcbs %v0, %v0, %v0, %v0, 16
- vstrcbs %v0, %v0, %v0
- vstrcbs %v0, %v0, %v0, %v0, 0, 0
+ vstrcbs %v0, %v0, %v0, %v0, -1
+ vstrcbs %v0, %v0, %v0, %v0, 16
+ vstrcbs %v0, %v0, %v0
+ vstrcbs %v0, %v0, %v0, %v0, 0, 0
#CHECK: error: invalid operand
#CHECK: vstrcf %v0, %v0, %v0, %v0, -1
@@ -1799,10 +1974,24 @@
#CHECK: error: invalid operand
#CHECK: vstrcf %v0, %v0, %v0, %v0, 0, 0
- vstrcf %v0, %v0, %v0, %v0, -1
- vstrcf %v0, %v0, %v0, %v0, 16
- vstrcf %v0, %v0, %v0
- vstrcf %v0, %v0, %v0, %v0, 0, 0
+ vstrcf %v0, %v0, %v0, %v0, -1
+ vstrcf %v0, %v0, %v0, %v0, 16
+ vstrcf %v0, %v0, %v0
+ vstrcf %v0, %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vstrcfs %v0, %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vstrcfs %v0, %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vstrcfs %v0, %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vstrcfs %v0, %v0, %v0, %v0, 0, 0
+
+ vstrcfs %v0, %v0, %v0, %v0, -1
+ vstrcfs %v0, %v0, %v0, %v0, 16
+ vstrcfs %v0, %v0, %v0
+ vstrcfs %v0, %v0, %v0, %v0, 0, 0
#CHECK: error: invalid operand
#CHECK: vstrch %v0, %v0, %v0, %v0, -1
@@ -1813,24 +2002,66 @@
#CHECK: error: invalid operand
#CHECK: vstrch %v0, %v0, %v0, %v0, 0, 0
- vstrch %v0, %v0, %v0, %v0, -1
- vstrch %v0, %v0, %v0, %v0, 16
- vstrch %v0, %v0, %v0
- vstrch %v0, %v0, %v0, %v0, 0, 0
+ vstrch %v0, %v0, %v0, %v0, -1
+ vstrch %v0, %v0, %v0, %v0, 16
+ vstrch %v0, %v0, %v0
+ vstrch %v0, %v0, %v0, %v0, 0, 0
#CHECK: error: invalid operand
-#CHECK: vstrczh %v0, %v0, %v0, %v0, -1
+#CHECK: vstrchs %v0, %v0, %v0, %v0, -1
#CHECK: error: invalid operand
-#CHECK: vstrczh %v0, %v0, %v0, %v0, 16
+#CHECK: vstrchs %v0, %v0, %v0, %v0, 16
#CHECK: error: too few operands
-#CHECK: vstrczh %v0, %v0, %v0
+#CHECK: vstrchs %v0, %v0, %v0
#CHECK: error: invalid operand
-#CHECK: vstrczh %v0, %v0, %v0, %v0, 0, 0
+#CHECK: vstrchs %v0, %v0, %v0, %v0, 0, 0
- vstrczh %v0, %v0, %v0, %v0, -1
- vstrczh %v0, %v0, %v0, %v0, 16
- vstrczh %v0, %v0, %v0
- vstrczh %v0, %v0, %v0, %v0, 0, 0
+ vstrchs %v0, %v0, %v0, %v0, -1
+ vstrchs %v0, %v0, %v0, %v0, 16
+ vstrchs %v0, %v0, %v0
+ vstrchs %v0, %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vstrczb %v0, %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vstrczb %v0, %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vstrczb %v0, %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vstrczb %v0, %v0, %v0, %v0, 0, 0
+
+ vstrczb %v0, %v0, %v0, %v0, -1
+ vstrczb %v0, %v0, %v0, %v0, 16
+ vstrczb %v0, %v0, %v0
+ vstrczb %v0, %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vstrczbs %v0, %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vstrczbs %v0, %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vstrczbs %v0, %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vstrczbs %v0, %v0, %v0, %v0, 0, 0
+
+ vstrczbs %v0, %v0, %v0, %v0, -1
+ vstrczbs %v0, %v0, %v0, %v0, 16
+ vstrczbs %v0, %v0, %v0
+ vstrczbs %v0, %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vstrczf %v0, %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vstrczf %v0, %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vstrczf %v0, %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vstrczf %v0, %v0, %v0, %v0, 0, 0
+
+ vstrczf %v0, %v0, %v0, %v0, -1
+ vstrczf %v0, %v0, %v0, %v0, 16
+ vstrczf %v0, %v0, %v0
+ vstrczf %v0, %v0, %v0, %v0, 0, 0
#CHECK: error: invalid operand
#CHECK: vstrczfs %v0, %v0, %v0, %v0, -1
@@ -1841,10 +2072,38 @@
#CHECK: error: invalid operand
#CHECK: vstrczfs %v0, %v0, %v0, %v0, 0, 0
- vstrczfs %v0, %v0, %v0, %v0, -1
- vstrczfs %v0, %v0, %v0, %v0, 16
- vstrczfs %v0, %v0, %v0
- vstrczfs %v0, %v0, %v0, %v0, 0, 0
+ vstrczfs %v0, %v0, %v0, %v0, -1
+ vstrczfs %v0, %v0, %v0, %v0, 16
+ vstrczfs %v0, %v0, %v0
+ vstrczfs %v0, %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vstrczh %v0, %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vstrczh %v0, %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vstrczh %v0, %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vstrczh %v0, %v0, %v0, %v0, 0, 0
+
+ vstrczh %v0, %v0, %v0, %v0, -1
+ vstrczh %v0, %v0, %v0, %v0, 16
+ vstrczh %v0, %v0, %v0
+ vstrczh %v0, %v0, %v0, %v0, 0, 0
+
+#CHECK: error: invalid operand
+#CHECK: vstrczhs %v0, %v0, %v0, %v0, -1
+#CHECK: error: invalid operand
+#CHECK: vstrczhs %v0, %v0, %v0, %v0, 16
+#CHECK: error: too few operands
+#CHECK: vstrczhs %v0, %v0, %v0
+#CHECK: error: invalid operand
+#CHECK: vstrczhs %v0, %v0, %v0, %v0, 0, 0
+
+ vstrczhs %v0, %v0, %v0, %v0, -1
+ vstrczhs %v0, %v0, %v0, %v0, 16
+ vstrczhs %v0, %v0, %v0
+ vstrczhs %v0, %v0, %v0, %v0, 0, 0
#CHECK: error: invalid operand
#CHECK: wcdgb %v0, %v0, 0, -1
@@ -1937,79 +2196,4 @@
wledb %v0, %v0, 0, 16
wledb %v0, %v0, -1, 0
wledb %v0, %v0, 16, 0
-
-#CHECK: error: invalid operand
-#CHECK: lochie %r0, 66000
-#CHECK: error: invalid operand
-#CHECK: lochie %f0, 0
-#CHECK: error: invalid operand
-#CHECK: lochie 0, %r0
-
- lochie %r0, 66000
- lochie %f0, 0
- lochie 0, %r0
-
-#CHECK: error: invalid operand
-#CHECK: locghie %r0, 66000
-#CHECK: error: invalid operand
-#CHECK: locghie %f0, 0
-#CHECK: error: invalid operand
-#CHECK: locghie 0, %r0
-
- locghie %r0, 66000
- locghie %f0, 0
- locghie 0, %r0
-
-#CHECK: error: invalid operand
-#CHECK: lochhie %r0, 66000
-#CHECK: error: invalid operand
-#CHECK: lochhie %f0, 0
-#CHECK: error: invalid operand
-#CHECK: lochhie 0, %r0
-
- lochhie %r0, 66000
- lochhie %f0, 0
- lochhie 0, %r0
-
-#CHECK: error: invalid operand
-#CHECK: locfh %r0,0,-1
-#CHECK: error: invalid operand
-#CHECK: locfh %r0,0,16
-#CHECK: error: invalid operand
-#CHECK: locfh %r0,-524289,1
-#CHECK: error: invalid operand
-#CHECK: locfh %r0,524288,1
-#CHECK: error: invalid use of indexed addressing
-#CHECK: locfh %r0,0(%r1,%r2),1
-
- locfh %r0,0,-1
- locfh %r0,0,16
- locfh %r0,-524289,1
- locfh %r0,524288,1
- locfh %r0,0(%r1,%r2),1
-
-#CHECK: error: invalid operand
-#CHECK: locfhr %r0,%r0,-1
-#CHECK: error: invalid operand
-#CHECK: locfhr %r0,%r0,16
-
- locfhr %r0,%r0,-1
- locfhr %r0,%r0,16
-
-#CHECK: error: invalid operand
-#CHECK: stocfh %r0,0,-1
-#CHECK: error: invalid operand
-#CHECK: stocfh %r0,0,16
-#CHECK: error: invalid operand
-#CHECK: stocfh %r0,-524289,1
-#CHECK: error: invalid operand
-#CHECK: stocfh %r0,524288,1
-#CHECK: error: invalid use of indexed addressing
-#CHECK: stocfh %r0,0(%r1,%r2),1
-
- stocfh %r0,0,-1
- stocfh %r0,0,16
- stocfh %r0,-524289,1
- stocfh %r0,524288,1
- stocfh %r0,0(%r1,%r2),1
diff --git a/test/MC/SystemZ/insn-bad-z196.s b/test/MC/SystemZ/insn-bad-z196.s
index e370f10eefb4..04c19ff6319c 100644
--- a/test/MC/SystemZ/insn-bad-z196.s
+++ b/test/MC/SystemZ/insn-bad-z196.s
@@ -503,6 +503,33 @@
fixbra %f0, 0, %f2, 0
fixbra %f2, 0, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: kmctr %r1, %r2, %r4
+#CHECK: error: invalid register pair
+#CHECK: kmctr %r2, %r1, %r4
+#CHECK: error: invalid register pair
+#CHECK: kmctr %r2, %r4, %r1
+
+ kmctr %r1, %r2, %r4
+ kmctr %r2, %r1, %r4
+ kmctr %r2, %r4, %r1
+
+#CHECK: error: invalid register pair
+#CHECK: kmf %r1, %r2
+#CHECK: error: invalid register pair
+#CHECK: kmf %r2, %r1
+
+ kmf %r1, %r2
+ kmf %r2, %r1
+
+#CHECK: error: invalid register pair
+#CHECK: kmo %r1, %r2
+#CHECK: error: invalid register pair
+#CHECK: kmo %r2, %r1
+
+ kmo %r1, %r2
+ kmo %r2, %r1
+
#CHECK: error: invalid operand
#CHECK: laa %r0, %r0, -524289
#CHECK: error: invalid operand
@@ -757,11 +784,6 @@
locr %r0,%r0,-1
locr %r0,%r0,16
-#CHECK: error: instruction requires: execution-hint
-#CHECK: niai 0, 0
-
- niai 0, 0
-
#CHECK: error: invalid register pair
#CHECK: lpd %r1, 0, 0
#CHECK: error: invalid use of indexed addressing
@@ -802,6 +824,11 @@
lpdg %r2, 0(%r1), -1(%r15)
lpdg %r2, 0(%r1), 4096(%r15)
+#CHECK: error: instruction requires: execution-hint
+#CHECK: niai 0, 0
+
+ niai 0, 0
+
#CHECK: error: instruction requires: transactional-execution
#CHECK: ntstg %r0, 524287(%r1,%r15)
@@ -933,14 +960,6 @@
stch %r0, 524288
#CHECK: error: invalid operand
-#CHECK: sthh %r0, -524289
-#CHECK: error: invalid operand
-#CHECK: sthh %r0, 524288
-
- sthh %r0, -524289
- sthh %r0, 524288
-
-#CHECK: error: invalid operand
#CHECK: stfh %r0, -524289
#CHECK: error: invalid operand
#CHECK: stfh %r0, 524288
@@ -949,6 +968,14 @@
stfh %r0, 524288
#CHECK: error: invalid operand
+#CHECK: sthh %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: sthh %r0, 524288
+
+ sthh %r0, -524289
+ sthh %r0, 524288
+
+#CHECK: error: invalid operand
#CHECK: stoc %r0,0,-1
#CHECK: error: invalid operand
#CHECK: stoc %r0,0,16
diff --git a/test/MC/SystemZ/insn-bad-zEC12.s b/test/MC/SystemZ/insn-bad-zEC12.s
index 53dbd638e716..4bc3be3292e4 100644
--- a/test/MC/SystemZ/insn-bad-zEC12.s
+++ b/test/MC/SystemZ/insn-bad-zEC12.s
@@ -63,31 +63,6 @@
bprp 0, 0, 0x1000000
#CHECK: error: invalid operand
-#CHECK: clt %r0, -1, 0
-#CHECK: error: invalid operand
-#CHECK: clt %r0, 16, 0
-#CHECK: error: invalid operand
-#CHECK: clt %r0, 12, -524289
-#CHECK: error: invalid operand
-#CHECK: clt %r0, 12, 524288
-#CHECK: error: invalid use of indexed addressing
-#CHECK: clt %r0, 12, 0(%r1,%r2)
-
- clt %r0, -1, 0
- clt %r0, 16, 0
- clt %r0, 12, -524289
- clt %r0, 12, 524288
- clt %r0, 12, 0(%r1,%r2)
-
-#CHECK: error: invalid instruction
-#CHECK: clto %r0, 0
-#CHECK: error: invalid instruction
-#CHECK: cltno %r0, 0
-
- clto %r0, 0
- cltno %r0, 0
-
-#CHECK: error: invalid operand
#CHECK: clgt %r0, -1, 0
#CHECK: error: invalid operand
#CHECK: clgt %r0, 16, 0
@@ -105,12 +80,37 @@
clgt %r0, 12, 0(%r1,%r2)
#CHECK: error: invalid instruction
-#CHECK: clgto %r0, 0
-#CHECK: error: invalid instruction
#CHECK: clgtno %r0, 0
+#CHECK: error: invalid instruction
+#CHECK: clgto %r0, 0
- clgto %r0, 0
clgtno %r0, 0
+ clgto %r0, 0
+
+#CHECK: error: invalid operand
+#CHECK: clt %r0, -1, 0
+#CHECK: error: invalid operand
+#CHECK: clt %r0, 16, 0
+#CHECK: error: invalid operand
+#CHECK: clt %r0, 12, -524289
+#CHECK: error: invalid operand
+#CHECK: clt %r0, 12, 524288
+#CHECK: error: invalid use of indexed addressing
+#CHECK: clt %r0, 12, 0(%r1,%r2)
+
+ clt %r0, -1, 0
+ clt %r0, 16, 0
+ clt %r0, 12, -524289
+ clt %r0, 12, 524288
+ clt %r0, 12, 0(%r1,%r2)
+
+#CHECK: error: invalid instruction
+#CHECK: cltno %r0, 0
+#CHECK: error: invalid instruction
+#CHECK: clto %r0, 0
+
+ cltno %r0, 0
+ clto %r0, 0
#CHECK: error: invalid operand
#CHECK: lat %r0, -524289
@@ -120,6 +120,11 @@
lat %r0, -524289
lat %r0, 524288
+#CHECK: error: instruction requires: vector
+#CHECK: lcbb %r0, 0, 0
+
+ lcbb %r0, 0, 0
+
#CHECK: error: invalid operand
#CHECK: lfhat %r0, -524289
#CHECK: error: invalid operand
@@ -152,10 +157,15 @@
llgtat %r0, -524289
llgtat %r0, 524288
-#CHECK: error: instruction requires: vector
-#CHECK: lcbb %r0, 0, 0
+#CHECK: error: instruction requires: load-store-on-cond-2
+#CHECK: locghio %r11, 42
- lcbb %r0, 0, 0
+ locghio %r11, 42
+
+#CHECK: error: instruction requires: load-store-on-cond-2
+#CHECK: lochio %r11, 42
+
+ lochio %r11, 42
#CHECK: error: invalid operand
#CHECK: niai -1, 0
@@ -187,6 +197,11 @@
ppa %r0, %r0, -1
ppa %r0, %r0, 16
+#CHECK: error: instruction requires: message-security-assist-extension5
+#CHECK: ppno %r2, %r4
+
+ ppno %r2, %r4
+
#CHECK: error: invalid operand
#CHECK: risbgn %r0,%r0,0,0,-1
#CHECK: error: invalid operand
@@ -337,28 +352,28 @@
#CHECK: error: instruction requires: vector
#CHECK: vceqb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
+#CHECK: vceqbs %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
#CHECK: vceqf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
+#CHECK: vceqfs %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
#CHECK: vceqg %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vceqh %v0, %v0, %v0
+#CHECK: vceqgs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vceqbs %v0, %v0, %v0
+#CHECK: vceqh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vceqhs %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vceqfs %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vceqgs %v0, %v0, %v0
vceqb %v0, %v0, %v0
+ vceqbs %v0, %v0, %v0
vceqf %v0, %v0, %v0
+ vceqfs %v0, %v0, %v0
vceqg %v0, %v0, %v0
+ vceqgs %v0, %v0, %v0
vceqh %v0, %v0, %v0
- vceqbs %v0, %v0, %v0
vceqhs %v0, %v0, %v0
- vceqfs %v0, %v0, %v0
- vceqgs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vcgdb %v0, %v0, 0, 0
@@ -368,54 +383,54 @@
#CHECK: error: instruction requires: vector
#CHECK: vchb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
+#CHECK: vchbs %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
#CHECK: vchf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
+#CHECK: vchfs %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
#CHECK: vchg %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vchh %v0, %v0, %v0
+#CHECK: vchgs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vchbs %v0, %v0, %v0
+#CHECK: vchh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vchhs %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vchfs %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vchgs %v0, %v0, %v0
vchb %v0, %v0, %v0
+ vchbs %v0, %v0, %v0
vchf %v0, %v0, %v0
+ vchfs %v0, %v0, %v0
vchg %v0, %v0, %v0
+ vchgs %v0, %v0, %v0
vchh %v0, %v0, %v0
- vchbs %v0, %v0, %v0
vchhs %v0, %v0, %v0
- vchfs %v0, %v0, %v0
- vchgs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vchlb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
+#CHECK: vchlbs %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
#CHECK: vchlf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
+#CHECK: vchlfs %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
#CHECK: vchlg %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vchlh %v0, %v0, %v0
+#CHECK: vchlgs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vchlbs %v0, %v0, %v0
+#CHECK: vchlh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vchlhs %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vchlfs %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vchlgs %v0, %v0, %v0
vchlb %v0, %v0, %v0
+ vchlbs %v0, %v0, %v0
vchlf %v0, %v0, %v0
+ vchlfs %v0, %v0, %v0
vchlg %v0, %v0, %v0
+ vchlgs %v0, %v0, %v0
vchlh %v0, %v0, %v0
- vchlbs %v0, %v0, %v0
vchlhs %v0, %v0, %v0
- vchlfs %v0, %v0, %v0
- vchlgs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vcksm %v0, %v0, %v0
@@ -470,20 +485,6 @@
vech %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: verimb %v0, %v0, %v0, 0
-#CHECK: error: instruction requires: vector
-#CHECK: verimf %v0, %v0, %v0, 0
-#CHECK: error: instruction requires: vector
-#CHECK: verimg %v0, %v0, %v0, 0
-#CHECK: error: instruction requires: vector
-#CHECK: verimh %v0, %v0, %v0, 0
-
- verimb %v0, %v0, %v0, 0
- verimf %v0, %v0, %v0, 0
- verimg %v0, %v0, %v0, 0
- verimh %v0, %v0, %v0, 0
-
-#CHECK: error: instruction requires: vector
#CHECK: veclb %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: veclf %v0, %v0
@@ -498,18 +499,18 @@
veclh %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: verllvb %v0, %v0, %v0
+#CHECK: verimb %v0, %v0, %v0, 0
#CHECK: error: instruction requires: vector
-#CHECK: verllvf %v0, %v0, %v0
+#CHECK: verimf %v0, %v0, %v0, 0
#CHECK: error: instruction requires: vector
-#CHECK: verllvg %v0, %v0, %v0
+#CHECK: verimg %v0, %v0, %v0, 0
#CHECK: error: instruction requires: vector
-#CHECK: verllvh %v0, %v0, %v0
+#CHECK: verimh %v0, %v0, %v0, 0
- verllvb %v0, %v0, %v0
- verllvf %v0, %v0, %v0
- verllvg %v0, %v0, %v0
- verllvh %v0, %v0, %v0
+ verimb %v0, %v0, %v0, 0
+ verimf %v0, %v0, %v0, 0
+ verimg %v0, %v0, %v0, 0
+ verimh %v0, %v0, %v0, 0
#CHECK: error: instruction requires: vector
#CHECK: verllb %v0, %v0, 0
@@ -526,18 +527,18 @@
verllh %v0, %v0, 0
#CHECK: error: instruction requires: vector
-#CHECK: veslvb %v0, %v0, %v0
+#CHECK: verllvb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: veslvf %v0, %v0, %v0
+#CHECK: verllvf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: veslvg %v0, %v0, %v0
+#CHECK: verllvg %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: veslvh %v0, %v0, %v0
+#CHECK: verllvh %v0, %v0, %v0
- veslvb %v0, %v0, %v0
- veslvf %v0, %v0, %v0
- veslvg %v0, %v0, %v0
- veslvh %v0, %v0, %v0
+ verllvb %v0, %v0, %v0
+ verllvf %v0, %v0, %v0
+ verllvg %v0, %v0, %v0
+ verllvh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: veslb %v0, %v0, 0
@@ -554,18 +555,18 @@
veslh %v0, %v0, 0
#CHECK: error: instruction requires: vector
-#CHECK: vesravb %v0, %v0, %v0
+#CHECK: veslvb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vesravf %v0, %v0, %v0
+#CHECK: veslvf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vesravg %v0, %v0, %v0
+#CHECK: veslvg %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vesravh %v0, %v0, %v0
+#CHECK: veslvh %v0, %v0, %v0
- vesravb %v0, %v0, %v0
- vesravf %v0, %v0, %v0
- vesravg %v0, %v0, %v0
- vesravh %v0, %v0, %v0
+ veslvb %v0, %v0, %v0
+ veslvf %v0, %v0, %v0
+ veslvg %v0, %v0, %v0
+ veslvh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vesrab %v0, %v0, 0
@@ -582,18 +583,18 @@
vesrah %v0, %v0, 0
#CHECK: error: instruction requires: vector
-#CHECK: vesrlvb %v0, %v0, %v0
+#CHECK: vesravb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vesrlvf %v0, %v0, %v0
+#CHECK: vesravf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vesrlvg %v0, %v0, %v0
+#CHECK: vesravg %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vesrlvh %v0, %v0, %v0
+#CHECK: vesravh %v0, %v0, %v0
- vesrlvb %v0, %v0, %v0
- vesrlvf %v0, %v0, %v0
- vesrlvg %v0, %v0, %v0
- vesrlvh %v0, %v0, %v0
+ vesravb %v0, %v0, %v0
+ vesravf %v0, %v0, %v0
+ vesravg %v0, %v0, %v0
+ vesravh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vesrlb %v0, %v0, 0
@@ -610,142 +611,156 @@
vesrlh %v0, %v0, 0
#CHECK: error: instruction requires: vector
-#CHECK: vfadb %v0, %v0, %v0
-
- vfadb %v0, %v0, %v0
-
+#CHECK: vesrlvb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfcedb %v0, %v0, %v0
-#CHECK: vfcedbs %v0, %v0, %v0
-
- vfcedb %v0, %v0, %v0
- vfcedbs %v0, %v0, %v0
-
+#CHECK: vesrlvf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfchdb %v0, %v0, %v0
-#CHECK: vfchdbs %v0, %v0, %v0
+#CHECK: vesrlvg %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vesrlvh %v0, %v0, %v0
- vfchdb %v0, %v0, %v0
- vfchdbs %v0, %v0, %v0
+ vesrlvb %v0, %v0, %v0
+ vesrlvf %v0, %v0, %v0
+ vesrlvg %v0, %v0, %v0
+ vesrlvh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfddb %v0, %v0, %v0
+#CHECK: vfadb %v0, %v0, %v0
- vfddb %v0, %v0, %v0
+ vfadb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vfaeb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfaezb %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
#CHECK: vfaebs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfaezbs %v0, %v0, %v0
+#CHECK: vfaef %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfaeh %v0, %v0, %v0
+#CHECK: vfaefs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfaezh %v0, %v0, %v0
+#CHECK: vfaeh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vfaehs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfaezhs %v0, %v0, %v0
+#CHECK: vfaezb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfaef %v0, %v0, %v0
+#CHECK: vfaezbs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vfaezf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfaefs %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
#CHECK: vfaezfs %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vfaezh %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vfaezhs %v0, %v0, %v0
vfaeb %v0, %v0, %v0
- vfaezb %v0, %v0, %v0
vfaebs %v0, %v0, %v0
- vfaezbs %v0, %v0, %v0
+ vfaef %v0, %v0, %v0
+ vfaefs %v0, %v0, %v0
vfaeh %v0, %v0, %v0
- vfaezh %v0, %v0, %v0
vfaehs %v0, %v0, %v0
- vfaezhs %v0, %v0, %v0
- vfaef %v0, %v0, %v0
+ vfaezb %v0, %v0, %v0
+ vfaezbs %v0, %v0, %v0
vfaezf %v0, %v0, %v0
- vfaefs %v0, %v0, %v0
vfaezfs %v0, %v0, %v0
+ vfaezh %v0, %v0, %v0
+ vfaezhs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfeeb %v0, %v0, %v0
+#CHECK: vfcedb %v0, %v0, %v0
+#CHECK: vfcedbs %v0, %v0, %v0
+
+ vfcedb %v0, %v0, %v0
+ vfcedbs %v0, %v0, %v0
+
#CHECK: error: instruction requires: vector
-#CHECK: vfeezb %v0, %v0, %v0
+#CHECK: vfchdb %v0, %v0, %v0
+#CHECK: vfchdbs %v0, %v0, %v0
+
+ vfchdb %v0, %v0, %v0
+ vfchdbs %v0, %v0, %v0
+
+#CHECK: error: instruction requires: vector
+#CHECK: vfddb %v0, %v0, %v0
+
+ vfddb %v0, %v0, %v0
+
+#CHECK: error: instruction requires: vector
+#CHECK: vfeeb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vfeebs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfeezbs %v0, %v0, %v0
+#CHECK: vfeef %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfeeh %v0, %v0, %v0
+#CHECK: vfeefs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfeezh %v0, %v0, %v0
+#CHECK: vfeeh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vfeehs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfeezhs %v0, %v0, %v0
+#CHECK: vfeezb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfeef %v0, %v0, %v0
+#CHECK: vfeezbs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vfeezf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfeefs %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
#CHECK: vfeezfs %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vfeezh %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vfeezhs %v0, %v0, %v0
vfeeb %v0, %v0, %v0
- vfeezb %v0, %v0, %v0
vfeebs %v0, %v0, %v0
- vfeezbs %v0, %v0, %v0
+ vfeef %v0, %v0, %v0
+ vfeefs %v0, %v0, %v0
vfeeh %v0, %v0, %v0
- vfeezh %v0, %v0, %v0
vfeehs %v0, %v0, %v0
- vfeezhs %v0, %v0, %v0
- vfeef %v0, %v0, %v0
+ vfeezb %v0, %v0, %v0
+ vfeezbs %v0, %v0, %v0
vfeezf %v0, %v0, %v0
- vfeefs %v0, %v0, %v0
vfeezfs %v0, %v0, %v0
+ vfeezh %v0, %v0, %v0
+ vfeezhs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vfeneb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfenezb %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
#CHECK: vfenebs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfenezbs %v0, %v0, %v0
+#CHECK: vfenef %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfeneh %v0, %v0, %v0
+#CHECK: vfenefs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfenezh %v0, %v0, %v0
+#CHECK: vfeneh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vfenehs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfenezhs %v0, %v0, %v0
+#CHECK: vfenezb %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfenef %v0, %v0, %v0
+#CHECK: vfenezbs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vfenezf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vfenefs %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
#CHECK: vfenezfs %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vfenezh %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vfenezhs %v0, %v0, %v0
vfeneb %v0, %v0, %v0
- vfenezb %v0, %v0, %v0
vfenebs %v0, %v0, %v0
- vfenezbs %v0, %v0, %v0
+ vfenef %v0, %v0, %v0
+ vfenefs %v0, %v0, %v0
vfeneh %v0, %v0, %v0
- vfenezh %v0, %v0, %v0
vfenehs %v0, %v0, %v0
- vfenezhs %v0, %v0, %v0
- vfenef %v0, %v0, %v0
+ vfenezb %v0, %v0, %v0
+ vfenezbs %v0, %v0, %v0
vfenezf %v0, %v0, %v0
- vfenefs %v0, %v0, %v0
vfenezfs %v0, %v0, %v0
+ vfenezh %v0, %v0, %v0
+ vfenezhs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vfidb %v0, %v0, 0, 0
@@ -753,26 +768,6 @@
vfidb %v0, %v0, 0, 0
#CHECK: error: instruction requires: vector
-#CHECK: vistrb %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vistrbs %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vistrh %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vistrhs %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vistrf %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vistrfs %v0, %v0
-
- vistrb %v0, %v0
- vistrbs %v0, %v0
- vistrh %v0, %v0
- vistrhs %v0, %v0
- vistrf %v0, %v0
- vistrfs %v0, %v0
-
-#CHECK: error: instruction requires: vector
#CHECK: vflcdb %v0, %v0
vflcdb %v0, %v0
@@ -873,6 +868,26 @@
vgmh %v0, 0, 0
#CHECK: error: instruction requires: vector
+#CHECK: vistrb %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vistrbs %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vistrf %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vistrfs %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vistrh %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vistrhs %v0, %v0
+
+ vistrb %v0, %v0
+ vistrbs %v0, %v0
+ vistrf %v0, %v0
+ vistrfs %v0, %v0
+ vistrh %v0, %v0
+ vistrhs %v0, %v0
+
+#CHECK: error: instruction requires: vector
#CHECK: vl %v0, 0
vl %v0, 0
@@ -1309,44 +1324,44 @@
vpkh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpksf %v0, %v0, %v0
+#CHECK: vpklsf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpksg %v0, %v0, %v0
+#CHECK: vpklsfs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpksh %v0, %v0, %v0
+#CHECK: vpklsg %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpksfs %v0, %v0, %v0
+#CHECK: vpklsgs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpksgs %v0, %v0, %v0
+#CHECK: vpklsh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpkshs %v0, %v0, %v0
+#CHECK: vpklshs %v0, %v0, %v0
- vpksf %v0, %v0, %v0
- vpksg %v0, %v0, %v0
- vpksh %v0, %v0, %v0
- vpksfs %v0, %v0, %v0
- vpksgs %v0, %v0, %v0
- vpkshs %v0, %v0, %v0
+ vpklsf %v0, %v0, %v0
+ vpklsfs %v0, %v0, %v0
+ vpklsg %v0, %v0, %v0
+ vpklsgs %v0, %v0, %v0
+ vpklsh %v0, %v0, %v0
+ vpklshs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpklsf %v0, %v0, %v0
+#CHECK: vpksf %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpklsg %v0, %v0, %v0
+#CHECK: vpksfs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpklsh %v0, %v0, %v0
+#CHECK: vpksg %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpklsfs %v0, %v0, %v0
+#CHECK: vpksgs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpklsgs %v0, %v0, %v0
+#CHECK: vpksh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vpklshs %v0, %v0, %v0
+#CHECK: vpkshs %v0, %v0, %v0
- vpklsf %v0, %v0, %v0
- vpklsg %v0, %v0, %v0
- vpklsh %v0, %v0, %v0
- vpklsfs %v0, %v0, %v0
- vpklsgs %v0, %v0, %v0
- vpklshs %v0, %v0, %v0
+ vpksf %v0, %v0, %v0
+ vpksfs %v0, %v0, %v0
+ vpksg %v0, %v0, %v0
+ vpksgs %v0, %v0, %v0
+ vpksh %v0, %v0, %v0
+ vpkshs %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vpopct %v0, %v0, 0
@@ -1502,40 +1517,48 @@
#CHECK: error: instruction requires: vector
#CHECK: vstrcb %v0, %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vstrczb %v0, %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
#CHECK: vstrcbs %v0, %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vstrczbs %v0, %v0, %v0, %v0
+#CHECK: vstrcf %v0, %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vstrch %v0, %v0, %v0, %v0
+#CHECK: vstrcfs %v0, %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vstrczh %v0, %v0, %v0, %v0
+#CHECK: vstrch %v0, %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vstrchs %v0, %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vstrczhs %v0, %v0, %v0, %v0
+#CHECK: vstrczb %v0, %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vstrcf %v0, %v0, %v0, %v0
+#CHECK: vstrczbs %v0, %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vstrczf %v0, %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vstrcfs %v0, %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
#CHECK: vstrczfs %v0, %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vstrczh %v0, %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vstrczhs %v0, %v0, %v0, %v0
vstrcb %v0, %v0, %v0, %v0
- vstrczb %v0, %v0, %v0, %v0
vstrcbs %v0, %v0, %v0, %v0
- vstrczbs %v0, %v0, %v0, %v0
+ vstrcf %v0, %v0, %v0, %v0
+ vstrcfs %v0, %v0, %v0, %v0
vstrch %v0, %v0, %v0, %v0
- vstrczh %v0, %v0, %v0, %v0
vstrchs %v0, %v0, %v0, %v0
- vstrczhs %v0, %v0, %v0, %v0
- vstrcf %v0, %v0, %v0, %v0
+ vstrczb %v0, %v0, %v0, %v0
+ vstrczbs %v0, %v0, %v0, %v0
vstrczf %v0, %v0, %v0, %v0
- vstrcfs %v0, %v0, %v0, %v0
vstrczfs %v0, %v0, %v0, %v0
+ vstrczh %v0, %v0, %v0, %v0
+ vstrczhs %v0, %v0, %v0, %v0
+
+#CHECK: error: instruction requires: vector
+#CHECK: vsumb %v0, %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vsumh %v0, %v0, %v0
+
+ vsumb %v0, %v0, %v0
+ vsumh %v0, %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vsumgh %v0, %v0, %v0
@@ -1554,14 +1577,6 @@
vsumqg %v0, %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vsumb %v0, %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vsumh %v0, %v0, %v0
-
- vsumb %v0, %v0, %v0
- vsumh %v0, %v0, %v0
-
-#CHECK: error: instruction requires: vector
#CHECK: vtm %v0, %v0
vtm %v0, %v0
@@ -1578,17 +1593,6 @@
vuphh %v0, %v0
#CHECK: error: instruction requires: vector
-#CHECK: vuplhb %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vuplhf %v0, %v0
-#CHECK: error: instruction requires: vector
-#CHECK: vuplhh %v0, %v0
-
- vuplhb %v0, %v0
- vuplhf %v0, %v0
- vuplhh %v0, %v0
-
-#CHECK: error: instruction requires: vector
#CHECK: vuplb %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vuplf %v0, %v0
@@ -1600,6 +1604,17 @@
vuplhw %v0, %v0
#CHECK: error: instruction requires: vector
+#CHECK: vuplhb %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vuplhf %v0, %v0
+#CHECK: error: instruction requires: vector
+#CHECK: vuplhh %v0, %v0
+
+ vuplhb %v0, %v0
+ vuplhf %v0, %v0
+ vuplhh %v0, %v0
+
+#CHECK: error: instruction requires: vector
#CHECK: vupllb %v0, %v0
#CHECK: error: instruction requires: vector
#CHECK: vupllf %v0, %v0
@@ -1741,13 +1756,3 @@
wledb %v0, %v0, 0, 0
-#CHECK: error: instruction requires: load-store-on-cond-2
-#CHECK: lochio %r11, 42
-
- lochio %r11, 42
-
-#CHECK: error: instruction requires: load-store-on-cond-2
-#CHECK: locghio %r11, 42
-
- locghio %r11, 42
-
diff --git a/test/MC/SystemZ/insn-bad.s b/test/MC/SystemZ/insn-bad.s
index 018070a74dfc..b96c661ae3da 100644
--- a/test/MC/SystemZ/insn-bad.s
+++ b/test/MC/SystemZ/insn-bad.s
@@ -167,16 +167,6 @@
alfi %r0, -1
alfi %r0, (1 << 32)
-#CHECK: error: instruction requires: distinct-ops
-#CHECK: alghsik %r1, %r2, 3
-
- alghsik %r1, %r2, 3
-
-#CHECK: error: instruction requires: distinct-ops
-#CHECK: alhsik %r1, %r2, 3
-
- alhsik %r1, %r2, 3
-
#CHECK: error: invalid operand
#CHECK: alg %r0, -524289
#CHECK: error: invalid operand
@@ -202,16 +192,60 @@
algfi %r0, (1 << 32)
#CHECK: error: instruction requires: distinct-ops
+#CHECK: alghsik %r1, %r2, 3
+
+ alghsik %r1, %r2, 3
+
+#CHECK: error: instruction requires: distinct-ops
#CHECK: algrk %r2,%r3,%r4
algrk %r2,%r3,%r4
#CHECK: error: instruction requires: distinct-ops
+#CHECK: alhsik %r1, %r2, 3
+
+ alhsik %r1, %r2, 3
+
+#CHECK: error: instruction requires: distinct-ops
#CHECK: alrk %r2,%r3,%r4
alrk %r2,%r3,%r4
#CHECK: error: invalid operand
+#CHECK: algsi -524289, 0
+#CHECK: error: invalid operand
+#CHECK: algsi 524288, 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: algsi 0(%r1,%r2), 0
+#CHECK: error: invalid operand
+#CHECK: algsi 0, -129
+#CHECK: error: invalid operand
+#CHECK: algsi 0, 128
+
+ algsi -524289, 0
+ algsi 524288, 0
+ algsi 0(%r1,%r2), 0
+ algsi 0, -129
+ algsi 0, 128
+
+#CHECK: error: invalid operand
+#CHECK: alsi -524289, 0
+#CHECK: error: invalid operand
+#CHECK: alsi 524288, 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: alsi 0(%r1,%r2), 0
+#CHECK: error: invalid operand
+#CHECK: alsi 0, -129
+#CHECK: error: invalid operand
+#CHECK: alsi 0, 128
+
+ alsi -524289, 0
+ alsi 524288, 0
+ alsi 0(%r1,%r2), 0
+ alsi 0, -129
+ alsi 0, 128
+
+#CHECK: error: invalid operand
#CHECK: aly %r0, -524289
#CHECK: error: invalid operand
#CHECK: aly %r0, 524288
@@ -219,6 +253,59 @@
aly %r0, -524289
aly %r0, 524288
+#CHECK: error: missing length in address
+#CHECK: ap 0, 0(1)
+#CHECK: error: missing length in address
+#CHECK: ap 0(1), 0
+#CHECK: error: missing length in address
+#CHECK: ap 0(%r1), 0(1,%r1)
+#CHECK: error: missing length in address
+#CHECK: ap 0(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: ap 0(0,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: ap 0(1,%r1), 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: ap 0(17,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: ap 0(1,%r1), 0(17,%r1)
+#CHECK: error: invalid operand
+#CHECK: ap -1(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: ap 4096(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: ap 0(1,%r1), -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: ap 0(1,%r1), 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: ap 0(1,%r0), 0(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: ap 0(1,%r1), 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: ap 0(%r1,%r2), 0(1,%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: ap 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: ap 0(-), 0(1)
+
+ ap 0, 0(1)
+ ap 0(1), 0
+ ap 0(%r1), 0(1,%r1)
+ ap 0(1,%r1), 0(%r1)
+ ap 0(0,%r1), 0(1,%r1)
+ ap 0(1,%r1), 0(0,%r1)
+ ap 0(17,%r1), 0(1,%r1)
+ ap 0(1,%r1), 0(17,%r1)
+ ap -1(1,%r1), 0(1,%r1)
+ ap 4096(1,%r1), 0(1,%r1)
+ ap 0(1,%r1), -1(1,%r1)
+ ap 0(1,%r1), 4096(1,%r1)
+ ap 0(1,%r0), 0(1,%r1)
+ ap 0(1,%r1), 0(1,%r0)
+ ap 0(%r1,%r2), 0(1,%r1)
+ ap 0(1,%r2), 0(%r1,%r2)
+ ap 0(-), 0(1)
+
#CHECK: error: instruction requires: distinct-ops
#CHECK: ark %r2,%r3,%r4
@@ -296,6 +383,22 @@
bcr -1, %r1
bcr 16, %r1
+#CHECK: error: invalid operand
+#CHECK: bct %r0, -1
+#CHECK: error: invalid operand
+#CHECK: bct %r0, 4096
+
+ bct %r0, -1
+ bct %r0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: bctg %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: bctg %r0, 524288
+
+ bctg %r0, -524289
+ bctg %r0, 524288
+
#CHECK: error: offset out of range
#CHECK: bras %r0, -0x100002
#CHECK: error: offset out of range
@@ -374,22 +477,6 @@
brcl -1, bar
brcl 16, bar
-#CHECK: error: invalid operand
-#CHECK: bct %r0, -1
-#CHECK: error: invalid operand
-#CHECK: bct %r0, 4096
-
- bct %r0, -1
- bct %r0, 4096
-
-#CHECK: error: invalid operand
-#CHECK: bctg %r0, -524289
-#CHECK: error: invalid operand
-#CHECK: bctg %r0, 524288
-
- bctg %r0, -524289
- bctg %r0, 524288
-
#CHECK: error: offset out of range
#CHECK: brct %r0, -0x100002
#CHECK: error: offset out of range
@@ -423,25 +510,6 @@
brcth %r0, 0
-#CHECK: error: invalid operand
-#CHECK: bxh %r0, %r0, 4096
-#CHECK: error: invalid use of indexed addressing
-#CHECK: bxh %r0, %r0, 0(%r1,%r2)
-
- bxh %r0, %r0, 4096
- bxh %r0, %r0, 0(%r1,%r2)
-
-#CHECK: error: invalid operand
-#CHECK: bxhg %r0, %r0, -524289
-#CHECK: error: invalid operand
-#CHECK: bxhg %r0, %r0, 524288
-#CHECK: error: invalid use of indexed addressing
-#CHECK: bxhg %r0, %r0, 0(%r1,%r2)
-
- bxhg %r0, %r0, -524289
- bxhg %r0, %r0, 524288
- bxhg %r0, %r0, 0(%r1,%r2)
-
#CHECK: error: offset out of range
#CHECK: brxh %r0, %r2, -0x100002
#CHECK: error: offset out of range
@@ -470,25 +538,6 @@
brxhg %r0, %r2, 1
brxhg %r0, %r2, 0x10000
-#CHECK: error: invalid operand
-#CHECK: bxle %r0, %r0, 4096
-#CHECK: error: invalid use of indexed addressing
-#CHECK: bxle %r0, %r0, 0(%r1,%r2)
-
- bxle %r0, %r0, 4096
- bxle %r0, %r0, 0(%r1,%r2)
-
-#CHECK: error: invalid operand
-#CHECK: bxhg %r0, %r0, -524289
-#CHECK: error: invalid operand
-#CHECK: bxhg %r0, %r0, 524288
-#CHECK: error: invalid use of indexed addressing
-#CHECK: bxhg %r0, %r0, 0(%r1,%r2)
-
- bxhg %r0, %r0, -524289
- bxhg %r0, %r0, 524288
- bxhg %r0, %r0, 0(%r1,%r2)
-
#CHECK: error: offset out of range
#CHECK: brxle %r0, %r2, -0x100002
#CHECK: error: offset out of range
@@ -518,6 +567,44 @@
brxlg %r0, %r2, 0x10000
#CHECK: error: invalid operand
+#CHECK: bxh %r0, %r0, 4096
+#CHECK: error: invalid use of indexed addressing
+#CHECK: bxh %r0, %r0, 0(%r1,%r2)
+
+ bxh %r0, %r0, 4096
+ bxh %r0, %r0, 0(%r1,%r2)
+
+#CHECK: error: invalid operand
+#CHECK: bxhg %r0, %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: bxhg %r0, %r0, 524288
+#CHECK: error: invalid use of indexed addressing
+#CHECK: bxhg %r0, %r0, 0(%r1,%r2)
+
+ bxhg %r0, %r0, -524289
+ bxhg %r0, %r0, 524288
+ bxhg %r0, %r0, 0(%r1,%r2)
+
+#CHECK: error: invalid operand
+#CHECK: bxle %r0, %r0, 4096
+#CHECK: error: invalid use of indexed addressing
+#CHECK: bxle %r0, %r0, 0(%r1,%r2)
+
+ bxle %r0, %r0, 4096
+ bxle %r0, %r0, 0(%r1,%r2)
+
+#CHECK: error: invalid operand
+#CHECK: bxleg %r0, %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: bxleg %r0, %r0, 524288
+#CHECK: error: invalid use of indexed addressing
+#CHECK: bxleg %r0, %r0, 0(%r1,%r2)
+
+ bxleg %r0, %r0, -524289
+ bxleg %r0, %r0, 524288
+ bxleg %r0, %r0, 0(%r1,%r2)
+
+#CHECK: error: invalid operand
#CHECK: c %r0, -1
#CHECK: error: invalid operand
#CHECK: c %r0, 4096
@@ -633,6 +720,17 @@
celgbr %f0, 0, %r0, 0
#CHECK: error: invalid operand
+#CHECK: cfc -1
+#CHECK: error: invalid operand
+#CHECK: cfc 4096
+#CHECK: error: invalid use of indexed addressing
+#CHECK: cfc 0(%r1,%r2)
+
+ cfc -1
+ cfc 4096
+ cfc 0(%r1,%r2)
+
+#CHECK: error: invalid operand
#CHECK: cfdbr %r0, -1, %f0
#CHECK: error: invalid operand
#CHECK: cfdbr %r0, 16, %f0
@@ -819,26 +917,26 @@
cgij %r0, 0, 0, 0x10000
#CHECK: error: invalid instruction
-#CHECK: cgijo %r0, 0, 0, 0
-#CHECK: error: invalid instruction
#CHECK: cgijno %r0, 0, 0, 0
+#CHECK: error: invalid instruction
+#CHECK: cgijo %r0, 0, 0, 0
- cgijo %r0, 0, 0, 0
cgijno %r0, 0, 0, 0
+ cgijo %r0, 0, 0, 0
#CHECK: error: invalid operand
#CHECK: cgit %r0, -32769
#CHECK: error: invalid operand
#CHECK: cgit %r0, 32768
#CHECK: error: invalid instruction
-#CHECK: cgito %r0, 0
-#CHECK: error: invalid instruction
#CHECK: cgitno %r0, 0
+#CHECK: error: invalid instruction
+#CHECK: cgito %r0, 0
cgit %r0, -32769
cgit %r0, 32768
- cgito %r0, 0
cgitno %r0, 0
+ cgito %r0, 0
#CHECK: error: offset out of range
#CHECK: cgrj %r0, %r0, 0, -0x100002
@@ -855,12 +953,12 @@
cgrj %r0, %r0, 0, 0x10000
#CHECK: error: invalid instruction
-#CHECK: cgrjo %r0, %r0, 0, 0
-#CHECK: error: invalid instruction
#CHECK: cgrjno %r0, %r0, 0, 0
+#CHECK: error: invalid instruction
+#CHECK: cgrjo %r0, %r0, 0, 0
- cgrjo %r0, %r0, 0, 0
cgrjno %r0, %r0, 0, 0
+ cgrjo %r0, %r0, 0, 0
#CHECK: error: offset out of range
#CHECK: cgrl %r0, -0x1000000002
@@ -877,12 +975,12 @@
cgrl %r0, 0x100000000
#CHECK: error: invalid instruction
-#CHECK: cgrto %r0, %r0
-#CHECK: error: invalid instruction
#CHECK: cgrtno %r0, %r0
+#CHECK: error: invalid instruction
+#CHECK: cgrto %r0, %r0
- cgrto %r0, %r0
cgrtno %r0, %r0
+ cgrto %r0, %r0
#CHECK: error: invalid operand
#CHECK: cgxbr %r0, -1, %f0
@@ -1008,26 +1106,31 @@
cij %r0, 0, 0, 0x10000
#CHECK: error: invalid instruction
-#CHECK: cijo %r0, 0, 0, 0
-#CHECK: error: invalid instruction
#CHECK: cijno %r0, 0, 0, 0
+#CHECK: error: invalid instruction
+#CHECK: cijo %r0, 0, 0, 0
- cijo %r0, 0, 0, 0
cijno %r0, 0, 0, 0
+ cijo %r0, 0, 0, 0
#CHECK: error: invalid operand
#CHECK: cit %r0, -32769
#CHECK: error: invalid operand
#CHECK: cit %r0, 32768
#CHECK: error: invalid instruction
-#CHECK: cito %r0, 0
-#CHECK: error: invalid instruction
#CHECK: citno %r0, 0
+#CHECK: error: invalid instruction
+#CHECK: cito %r0, 0
cit %r0, -32769
cit %r0, 32768
- cito %r0, 0
citno %r0, 0
+ cito %r0, 0
+
+#CHECK: error: invalid register pair
+#CHECK: cksm %r0, %r1
+
+ cksm %r0, %r1
#CHECK: error: invalid operand
#CHECK: cl %r0, -1
@@ -1081,10 +1184,41 @@
clc 0(1,%r2), 0(%r1,%r2)
clc 0(-), 0
-#CHECK: error: instruction requires: high-word
-#CHECK: clhf %r0, 0
+#CHECK: error: invalid register pair
+#CHECK: clcl %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: clcl %r0, %r1
- clhf %r0, 0
+ clcl %r1, %r0
+ clcl %r0, %r1
+
+#CHECK: error: invalid register pair
+#CHECK: clcle %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: clcle %r0, %r1
+#CHECK: error: invalid operand
+#CHECK: clcle %r0, %r0, -1
+#CHECK: error: invalid operand
+#CHECK: clcle %r0, %r0, 4096
+
+ clcle %r1, %r0, 0
+ clcle %r0, %r1, 0
+ clcle %r0, %r0, -1
+ clcle %r0, %r0, 4096
+
+#CHECK: error: invalid register pair
+#CHECK: clclu %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: clclu %r0, %r1
+#CHECK: error: invalid operand
+#CHECK: clclu %r0, %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: clclu %r0, %r0, 524288
+
+ clclu %r1, %r0, 0
+ clclu %r0, %r1, 0
+ clclu %r0, %r0, -524289
+ clclu %r0, %r0, 524288
#CHECK: error: instruction requires: fp-extension
#CHECK: clfdbr %r0, 0, %f0, 0
@@ -1126,14 +1260,14 @@
#CHECK: error: invalid operand
#CHECK: clfit %r0, 65536
#CHECK: error: invalid instruction
-#CHECK: clfito %r0, 0
-#CHECK: error: invalid instruction
#CHECK: clfitno %r0, 0
+#CHECK: error: invalid instruction
+#CHECK: clfito %r0, 0
clfit %r0, -1
clfit %r0, 65536
- clfito %r0, 0
clfitno %r0, 0
+ clfito %r0, 0
#CHECK: error: instruction requires: fp-extension
#CHECK: clfxbr %r0, 0, %f0, 0
@@ -1148,20 +1282,6 @@
clg %r0, -524289
clg %r0, 524288
-#CHECK: error: invalid operand
-#CHECK: clgit %r0, -1
-#CHECK: error: invalid operand
-#CHECK: clgit %r0, 65536
-#CHECK: error: invalid instruction
-#CHECK: clgito %r0, 0
-#CHECK: error: invalid instruction
-#CHECK: clgitno %r0, 0
-
- clgit %r0, -1
- clgit %r0, 65536
- clgito %r0, 0
- clgitno %r0, 0
-
#CHECK: error: instruction requires: fp-extension
#CHECK: clgdbr %r0, 0, %f0, 0
@@ -1256,12 +1376,26 @@
clgij %r0, 0, 0, 0x10000
#CHECK: error: invalid instruction
-#CHECK: clgijo %r0, 0, 0, 0
-#CHECK: error: invalid instruction
#CHECK: clgijno %r0, 0, 0, 0
+#CHECK: error: invalid instruction
+#CHECK: clgijo %r0, 0, 0, 0
- clgijo %r0, 0, 0, 0
clgijno %r0, 0, 0, 0
+ clgijo %r0, 0, 0, 0
+
+#CHECK: error: invalid operand
+#CHECK: clgit %r0, -1
+#CHECK: error: invalid operand
+#CHECK: clgit %r0, 65536
+#CHECK: error: invalid instruction
+#CHECK: clgitno %r0, 0
+#CHECK: error: invalid instruction
+#CHECK: clgito %r0, 0
+
+ clgit %r0, -1
+ clgit %r0, 65536
+ clgitno %r0, 0
+ clgito %r0, 0
#CHECK: error: offset out of range
#CHECK: clgrj %r0, %r0, 0, -0x100002
@@ -1292,18 +1426,23 @@
clgrl %r0, 0x100000000
#CHECK: error: invalid instruction
-#CHECK: clgrto %r0, %r0
-#CHECK: error: invalid instruction
#CHECK: clgrtno %r0, %r0
+#CHECK: error: invalid instruction
+#CHECK: clgrto %r0, %r0
- clgrto %r0, %r0
clgrtno %r0, %r0
+ clgrto %r0, %r0
#CHECK: error: instruction requires: fp-extension
#CHECK: clgxbr %r0, 0, %f0, 0
clgxbr %r0, 0, %f0, 0
+#CHECK: error: instruction requires: high-word
+#CHECK: clhf %r0, 0
+
+ clhf %r0, 0
+
#CHECK: error: invalid operand
#CHECK: clhhsi -1, 0
#CHECK: error: invalid operand
@@ -1380,12 +1519,12 @@
clij %r0, 0, 0, 0x10000
#CHECK: error: invalid instruction
-#CHECK: clijo %r0, 0, 0, 0
-#CHECK: error: invalid instruction
#CHECK: clijno %r0, 0, 0, 0
+#CHECK: error: invalid instruction
+#CHECK: clijo %r0, 0, 0, 0
- clijo %r0, 0, 0, 0
clijno %r0, 0, 0, 0
+ clijo %r0, 0, 0, 0
#CHECK: error: invalid operand
#CHECK: cliy -524289, 0
@@ -1404,6 +1543,48 @@
cliy 0, -1
cliy 0, 256
+#CHECK: error: invalid operand
+#CHECK: clm %r0, 0, -1
+#CHECK: error: invalid operand
+#CHECK: clm %r0, 0, 4096
+#CHECK: error: invalid operand
+#CHECK: clm %r0, -1, 0
+#CHECK: error: invalid operand
+#CHECK: clm %r0, 16, 0
+
+ clm %r0, 0, -1
+ clm %r0, 0, 4096
+ clm %r0, -1, 0
+ clm %r0, 16, 0
+
+#CHECK: error: invalid operand
+#CHECK: clmh %r0, 0, -524289
+#CHECK: error: invalid operand
+#CHECK: clmh %r0, 0, 524288
+#CHECK: error: invalid operand
+#CHECK: clmh %r0, -1, 0
+#CHECK: error: invalid operand
+#CHECK: clmh %r0, 16, 0
+
+ clmh %r0, 0, -524289
+ clmh %r0, 0, 524288
+ clmh %r0, -1, 0
+ clmh %r0, 16, 0
+
+#CHECK: error: invalid operand
+#CHECK: clmy %r0, 0, -524289
+#CHECK: error: invalid operand
+#CHECK: clmy %r0, 0, 524288
+#CHECK: error: invalid operand
+#CHECK: clmy %r0, -1, 0
+#CHECK: error: invalid operand
+#CHECK: clmy %r0, 16, 0
+
+ clmy %r0, 0, -524289
+ clmy %r0, 0, 524288
+ clmy %r0, -1, 0
+ clmy %r0, 16, 0
+
#CHECK: error: offset out of range
#CHECK: clrj %r0, %r0, 0, -0x100002
#CHECK: error: offset out of range
@@ -1419,12 +1600,12 @@
clrj %r0, %r0, 0, 0x10000
#CHECK: error: invalid instruction
-#CHECK: clrjo %r0, %r0, 0, 0
-#CHECK: error: invalid instruction
#CHECK: clrjno %r0, %r0, 0, 0
+#CHECK: error: invalid instruction
+#CHECK: clrjo %r0, %r0, 0, 0
- clrjo %r0, %r0, 0, 0
clrjno %r0, %r0, 0, 0
+ clrjo %r0, %r0, 0, 0
#CHECK: error: offset out of range
#CHECK: clrl %r0, -0x1000000002
@@ -1441,12 +1622,12 @@
clrl %r0, 0x100000000
#CHECK: error: invalid instruction
-#CHECK: clrto %r0, %r0
-#CHECK: error: invalid instruction
#CHECK: clrtno %r0, %r0
+#CHECK: error: invalid instruction
+#CHECK: clrto %r0, %r0
- clrto %r0, %r0
clrtno %r0, %r0
+ clrto %r0, %r0
#CHECK: error: invalid operand
#CHECK: cly %r0, -524289
@@ -1456,6 +1637,67 @@
cly %r0, -524289
cly %r0, 524288
+#CHECK: error: invalid register pair
+#CHECK: cmpsc %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: cmpsc %r0, %r1
+
+ cmpsc %r1, %r0
+ cmpsc %r0, %r1
+
+#CHECK: error: missing length in address
+#CHECK: cp 0, 0(1)
+#CHECK: error: missing length in address
+#CHECK: cp 0(1), 0
+#CHECK: error: missing length in address
+#CHECK: cp 0(%r1), 0(1,%r1)
+#CHECK: error: missing length in address
+#CHECK: cp 0(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: cp 0(0,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: cp 0(1,%r1), 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: cp 0(17,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: cp 0(1,%r1), 0(17,%r1)
+#CHECK: error: invalid operand
+#CHECK: cp -1(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: cp 4096(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: cp 0(1,%r1), -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: cp 0(1,%r1), 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: cp 0(1,%r0), 0(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: cp 0(1,%r1), 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: cp 0(%r1,%r2), 0(1,%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: cp 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: cp 0(-), 0(1)
+
+ cp 0, 0(1)
+ cp 0(1), 0
+ cp 0(%r1), 0(1,%r1)
+ cp 0(1,%r1), 0(%r1)
+ cp 0(0,%r1), 0(1,%r1)
+ cp 0(1,%r1), 0(0,%r1)
+ cp 0(17,%r1), 0(1,%r1)
+ cp 0(1,%r1), 0(17,%r1)
+ cp -1(1,%r1), 0(1,%r1)
+ cp 4096(1,%r1), 0(1,%r1)
+ cp 0(1,%r1), -1(1,%r1)
+ cp 0(1,%r1), 4096(1,%r1)
+ cp 0(1,%r0), 0(1,%r1)
+ cp 0(1,%r1), 0(1,%r0)
+ cp 0(%r1,%r2), 0(1,%r1)
+ cp 0(1,%r2), 0(%r1,%r2)
+ cp 0(-), 0(1)
+
#CHECK: error: offset out of range
#CHECK: crj %r0, %r0, 0, -0x100002
#CHECK: error: offset out of range
@@ -1471,12 +1713,12 @@
crj %r0, %r0, 0, 0x10000
#CHECK: error: invalid instruction
-#CHECK: crjo %r0, %r0, 0, 0
-#CHECK: error: invalid instruction
#CHECK: crjno %r0, %r0, 0, 0
+#CHECK: error: invalid instruction
+#CHECK: crjo %r0, %r0, 0, 0
- crjo %r0, %r0, 0, 0
crjno %r0, %r0, 0, 0
+ crjo %r0, %r0, 0, 0
#CHECK: error: offset out of range
#CHECK: crl %r0, -0x1000000002
@@ -1493,12 +1735,12 @@
crl %r0, 0x100000000
#CHECK: error: invalid instruction
-#CHECK: crto %r0, %r0
-#CHECK: error: invalid instruction
#CHECK: crtno %r0, %r0
+#CHECK: error: invalid instruction
+#CHECK: crto %r0, %r0
- crto %r0, %r0
crtno %r0, %r0
+ crto %r0, %r0
#CHECK: error: invalid operand
#CHECK: cs %r0, %r0, -1
@@ -1522,17 +1764,6 @@
csg %r0, %r0, 524288
csg %r0, %r0, 0(%r1,%r2)
-#CHECK: error: invalid operand
-#CHECK: csy %r0, %r0, -524289
-#CHECK: error: invalid operand
-#CHECK: csy %r0, %r0, 524288
-#CHECK: error: invalid use of indexed addressing
-#CHECK: csy %r0, %r0, 0(%r1,%r2)
-
- csy %r0, %r0, -524289
- csy %r0, %r0, 524288
- csy %r0, %r0, 0(%r1,%r2)
-
#CHECK: error: invalid use of indexed addressing
#CHECK: csst 160(%r1,%r15), 160(%r15), %r2
#CHECK: error: invalid operand
@@ -1550,6 +1781,173 @@
csst 0(%r1), -1(%r15), %r2
csst 0(%r1), 4096(%r15), %r2
+#CHECK: error: invalid operand
+#CHECK: csy %r0, %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: csy %r0, %r0, 524288
+#CHECK: error: invalid use of indexed addressing
+#CHECK: csy %r0, %r0, 0(%r1,%r2)
+
+ csy %r0, %r0, -524289
+ csy %r0, %r0, 524288
+ csy %r0, %r0, 0(%r1,%r2)
+
+#CHECK: error: invalid register pair
+#CHECK: cu12 %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: cu12 %r0, %r1
+#CHECK: error: invalid operand
+#CHECK: cu12 %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: cu12 %r2, %r4, 16
+
+ cu12 %r1, %r0
+ cu12 %r0, %r1
+ cu12 %r2, %r4, -1
+ cu12 %r2, %r4, 16
+
+#CHECK: error: invalid register pair
+#CHECK: cu14 %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: cu14 %r0, %r1
+#CHECK: error: invalid operand
+#CHECK: cu14 %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: cu14 %r2, %r4, 16
+
+ cu14 %r1, %r0
+ cu14 %r0, %r1
+ cu14 %r2, %r4, -1
+ cu14 %r2, %r4, 16
+
+#CHECK: error: invalid register pair
+#CHECK: cu21 %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: cu21 %r0, %r1
+#CHECK: error: invalid operand
+#CHECK: cu21 %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: cu21 %r2, %r4, 16
+
+ cu21 %r1, %r0
+ cu21 %r0, %r1
+ cu21 %r2, %r4, -1
+ cu21 %r2, %r4, 16
+
+#CHECK: error: invalid register pair
+#CHECK: cu24 %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: cu24 %r0, %r1
+#CHECK: error: invalid operand
+#CHECK: cu24 %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: cu24 %r2, %r4, 16
+
+ cu24 %r1, %r0
+ cu24 %r0, %r1
+ cu24 %r2, %r4, -1
+ cu24 %r2, %r4, 16
+
+#CHECK: error: invalid register pair
+#CHECK: cu41 %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: cu41 %r0, %r1
+
+ cu41 %r1, %r0
+ cu41 %r0, %r1
+
+#CHECK: error: invalid register pair
+#CHECK: cu42 %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: cu42 %r0, %r1
+
+ cu42 %r1, %r0
+ cu42 %r0, %r1
+
+#CHECK: error: invalid register pair
+#CHECK: cuse %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: cuse %r0, %r1
+
+ cuse %r1, %r0
+ cuse %r0, %r1
+
+#CHECK: error: invalid register pair
+#CHECK: cutfu %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: cutfu %r0, %r1
+#CHECK: error: invalid operand
+#CHECK: cutfu %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: cutfu %r2, %r4, 16
+
+ cutfu %r1, %r0
+ cutfu %r0, %r1
+ cutfu %r2, %r4, -1
+ cutfu %r2, %r4, 16
+
+#CHECK: error: invalid register pair
+#CHECK: cuutf %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: cuutf %r0, %r1
+#CHECK: error: invalid operand
+#CHECK: cuutf %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: cuutf %r2, %r4, 16
+
+ cuutf %r1, %r0
+ cuutf %r0, %r1
+ cuutf %r2, %r4, -1
+ cuutf %r2, %r4, 16
+
+#CHECK: error: invalid operand
+#CHECK: cvb %r0, -1
+#CHECK: error: invalid operand
+#CHECK: cvb %r0, 4096
+
+ cvb %r0, -1
+ cvb %r0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: cvbg %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: cvbg %r0, 524288
+
+ cvbg %r0, -524289
+ cvbg %r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: cvby %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: cvby %r0, 524288
+
+ cvby %r0, -524289
+ cvby %r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: cvd %r0, -1
+#CHECK: error: invalid operand
+#CHECK: cvd %r0, 4096
+
+ cvd %r0, -1
+ cvd %r0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: cvdg %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: cvdg %r0, 524288
+
+ cvdg %r0, -524289
+ cvdg %r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: cvdy %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: cvdy %r0, 524288
+
+ cvdy %r0, -524289
+ cvdy %r0, 524288
+
#CHECK: error: invalid register pair
#CHECK: cxbr %f0, %f2
#CHECK: error: invalid register pair
@@ -1597,6 +1995,17 @@
cy %r0, 524288
#CHECK: error: invalid operand
+#CHECK: d %r0, -1
+#CHECK: error: invalid operand
+#CHECK: d %r0, 4096
+#CHECK: error: invalid register pair
+#CHECK: d %r1, 0
+
+ d %r0, -1
+ d %r0, 4096
+ d %r1, 0
+
+#CHECK: error: invalid operand
#CHECK: ddb %f0, -1
#CHECK: error: invalid operand
#CHECK: ddb %f0, 4096
@@ -1613,6 +2022,22 @@
deb %f0, 4096
#CHECK: error: invalid operand
+#CHECK: didbr %f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: didbr %f0, %f0, %f0, 16
+
+ didbr %f0, %f0, %f0, -1
+ didbr %f0, %f0, %f0, 16
+
+#CHECK: error: invalid operand
+#CHECK: diebr %f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: diebr %f0, %f0, %f0, 16
+
+ diebr %f0, %f0, %f0, -1
+ diebr %f0, %f0, %f0, 16
+
+#CHECK: error: invalid operand
#CHECK: dl %r0, -524289
#CHECK: error: invalid operand
#CHECK: dl %r0, 524288
@@ -1623,6 +2048,11 @@
dl %r0, 524288
dl %r1, 0
+#CHECK: error: invalid register pair
+#CHECK: dr %r1, %r0
+
+ dr %r1, %r0
+
#CHECK: error: invalid operand
#CHECK: dlg %r0, -524289
#CHECK: error: invalid operand
@@ -1644,6 +2074,59 @@
dlr %r1, %r0
+#CHECK: error: missing length in address
+#CHECK: dp 0, 0(1)
+#CHECK: error: missing length in address
+#CHECK: dp 0(1), 0
+#CHECK: error: missing length in address
+#CHECK: dp 0(%r1), 0(1,%r1)
+#CHECK: error: missing length in address
+#CHECK: dp 0(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: dp 0(0,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: dp 0(1,%r1), 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: dp 0(17,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: dp 0(1,%r1), 0(17,%r1)
+#CHECK: error: invalid operand
+#CHECK: dp -1(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: dp 4096(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: dp 0(1,%r1), -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: dp 0(1,%r1), 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: dp 0(1,%r0), 0(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: dp 0(1,%r1), 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: dp 0(%r1,%r2), 0(1,%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: dp 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: dp 0(-), 0(1)
+
+ dp 0, 0(1)
+ dp 0(1), 0
+ dp 0(%r1), 0(1,%r1)
+ dp 0(1,%r1), 0(%r1)
+ dp 0(0,%r1), 0(1,%r1)
+ dp 0(1,%r1), 0(0,%r1)
+ dp 0(17,%r1), 0(1,%r1)
+ dp 0(1,%r1), 0(17,%r1)
+ dp -1(1,%r1), 0(1,%r1)
+ dp 4096(1,%r1), 0(1,%r1)
+ dp 0(1,%r1), -1(1,%r1)
+ dp 0(1,%r1), 4096(1,%r1)
+ dp 0(1,%r0), 0(1,%r1)
+ dp 0(1,%r1), 0(1,%r0)
+ dp 0(%r1,%r2), 0(1,%r1)
+ dp 0(1,%r2), 0(%r1,%r2)
+ dp 0(-), 0(1)
+
#CHECK: error: invalid operand
#CHECK: dsg %r0, -524289
#CHECK: error: invalid operand
@@ -1685,12 +2168,15 @@
dxbr %f2, %f0
#CHECK: error: invalid operand
-#CHECK: ex %r0, -1
+#CHECK: ecag %r0, %r0, -524289
#CHECK: error: invalid operand
-#CHECK: ex %r0, 4096
+#CHECK: ecag %r0, %r0, 524288
+#CHECK: error: invalid use of indexed addressing
+#CHECK: ecag %r0, %r0, 0(%r1,%r2)
- ex %r0, -1
- ex %r0, 4096
+ ecag %r0, %r0, -524289
+ ecag %r0, %r0, 524288
+ ecag %r0, %r0, 0(%r1,%r2)
#CHECK: error: invalid use of indexed addressing
#CHECK: ectg 160(%r1,%r15),160(%r15), %r2
@@ -1709,6 +2195,102 @@
ectg 0(%r1),-1(%r15), %r2
ectg 0(%r1),4096(%r15), %r2
+#CHECK: error: missing length in address
+#CHECK: ed 0, 0
+#CHECK: error: missing length in address
+#CHECK: ed 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: ed 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: ed 0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: ed 0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: ed -1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: ed 4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: ed 0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: ed 0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: ed 0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: ed 0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: ed 0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: ed 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: ed 0(-), 0
+
+ ed 0, 0
+ ed 0(%r1), 0(%r1)
+ ed 0(1,%r1), 0(2,%r1)
+ ed 0(0,%r1), 0(%r1)
+ ed 0(257,%r1), 0(%r1)
+ ed -1(1,%r1), 0(%r1)
+ ed 4096(1,%r1), 0(%r1)
+ ed 0(1,%r1), -1(%r1)
+ ed 0(1,%r1), 4096(%r1)
+ ed 0(1,%r0), 0(%r1)
+ ed 0(1,%r1), 0(%r0)
+ ed 0(%r1,%r2), 0(%r1)
+ ed 0(1,%r2), 0(%r1,%r2)
+ ed 0(-), 0
+
+#CHECK: error: missing length in address
+#CHECK: edmk 0, 0
+#CHECK: error: missing length in address
+#CHECK: edmk 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: edmk 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: edmk 0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: edmk 0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: edmk -1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: edmk 4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: edmk 0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: edmk 0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: edmk 0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: edmk 0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: edmk 0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: edmk 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: edmk 0(-), 0
+
+ edmk 0, 0
+ edmk 0(%r1), 0(%r1)
+ edmk 0(1,%r1), 0(2,%r1)
+ edmk 0(0,%r1), 0(%r1)
+ edmk 0(257,%r1), 0(%r1)
+ edmk -1(1,%r1), 0(%r1)
+ edmk 4096(1,%r1), 0(%r1)
+ edmk 0(1,%r1), -1(%r1)
+ edmk 0(1,%r1), 4096(%r1)
+ edmk 0(1,%r0), 0(%r1)
+ edmk 0(1,%r1), 0(%r0)
+ edmk 0(%r1,%r2), 0(%r1)
+ edmk 0(1,%r2), 0(%r1,%r2)
+ edmk 0(-), 0
+
+#CHECK: error: invalid operand
+#CHECK: ex %r0, -1
+#CHECK: error: invalid operand
+#CHECK: ex %r0, 4096
+
+ ex %r0, -1
+ ex %r0, 4096
+
#CHECK: error: invalid operand
#CHECK: fidbr %f0, -1, %f0
#CHECK: error: invalid operand
@@ -1866,6 +2448,76 @@
iill %r0, 0x10000
#CHECK: error: invalid operand
+#CHECK: kdb %f0, -1
+#CHECK: error: invalid operand
+#CHECK: kdb %f0, 4096
+
+ kdb %f0, -1
+ kdb %f0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: keb %f0, -1
+#CHECK: error: invalid operand
+#CHECK: keb %f0, 4096
+
+ keb %f0, -1
+ keb %f0, 4096
+
+#CHECK: error: invalid register pair
+#CHECK: kimd %r0, %r1
+
+ kimd %r0, %r1
+
+#CHECK: error: invalid register pair
+#CHECK: klmd %r0, %r1
+
+ klmd %r0, %r1
+
+#CHECK: error: invalid register pair
+#CHECK: km %r1, %r2
+#CHECK: error: invalid register pair
+#CHECK: km %r2, %r1
+
+ km %r1, %r2
+ km %r2, %r1
+
+#CHECK: error: invalid register pair
+#CHECK: kmac %r0, %r1
+
+ kmac %r0, %r1
+
+#CHECK: error: invalid register pair
+#CHECK: kmc %r1, %r2
+#CHECK: error: invalid register pair
+#CHECK: kmc %r2, %r1
+
+ kmc %r1, %r2
+ kmc %r2, %r1
+
+#CHECK: error: instruction requires: message-security-assist-extension4
+#CHECK: kmctr %r2, %r4, %r6
+
+ kmctr %r2, %r4, %r6
+
+#CHECK: error: instruction requires: message-security-assist-extension4
+#CHECK: kmf %r2, %r4
+
+ kmf %r2, %r4
+
+#CHECK: error: instruction requires: message-security-assist-extension4
+#CHECK: kmo %r2, %r4
+
+ kmo %r2, %r4
+
+#CHECK: error: invalid register pair
+#CHECK: kxbr %f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: kxbr %f2, %f0
+
+ kxbr %f0, %f2
+ kxbr %f2, %f0
+
+#CHECK: error: invalid operand
#CHECK: l %r0, -1
#CHECK: error: invalid operand
#CHECK: l %r0, 4096
@@ -1881,14 +2533,6 @@
la %r0, -1
la %r0, 4096
-#CHECK: error: invalid operand
-#CHECK: lae %r0, -1
-#CHECK: error: invalid operand
-#CHECK: lae %r0, 4096
-
- lae %r0, -1
- lae %r0, 4096
-
#CHECK: error: instruction requires: interlocked-access1
#CHECK: laa %r1, %r2, 100(%r3)
laa %r1, %r2, 100(%r3)
@@ -1906,6 +2550,14 @@
laalg %r1, %r2, 100(%r3)
#CHECK: error: invalid operand
+#CHECK: lae %r0, -1
+#CHECK: error: invalid operand
+#CHECK: lae %r0, 4096
+
+ lae %r0, -1
+ lae %r0, 4096
+
+#CHECK: error: invalid operand
#CHECK: laey %r0, -524289
#CHECK: error: invalid operand
#CHECK: laey %r0, 524288
@@ -1948,14 +2600,6 @@
#CHECK: laog %r1, %r2, 100(%r3)
laog %r1, %r2, 100(%r3)
-#CHECK: error: instruction requires: interlocked-access1
-#CHECK: lax %r1, %r2, 100(%r3)
- lax %r1, %r2, 100(%r3)
-
-#CHECK: error: instruction requires: interlocked-access1
-#CHECK: laxg %r1, %r2, 100(%r3)
- laxg %r1, %r2, 100(%r3)
-
#CHECK: error: offset out of range
#CHECK: larl %r0, -0x1000000002
#CHECK: error: offset out of range
@@ -1970,6 +2614,14 @@
larl %r0, 1
larl %r0, 0x100000000
+#CHECK: error: instruction requires: interlocked-access1
+#CHECK: lax %r1, %r2, 100(%r3)
+ lax %r1, %r2, 100(%r3)
+
+#CHECK: error: instruction requires: interlocked-access1
+#CHECK: laxg %r1, %r2, 100(%r3)
+ laxg %r1, %r2, 100(%r3)
+
#CHECK: error: invalid operand
#CHECK: lay %r0, -524289
#CHECK: error: invalid operand
@@ -2070,11 +2722,6 @@
ley %f0, -524289
ley %f0, 524288
-#CHECK: error: instruction requires: high-word
-#CHECK: lfh %r0, 0
-
- lfh %r0, 0
-
#CHECK: error: invalid operand
#CHECK: lfas -1
#CHECK: error: invalid operand
@@ -2086,6 +2733,11 @@
lfas 4096
lfas 0(%r1,%r2)
+#CHECK: error: instruction requires: high-word
+#CHECK: lfh %r0, 0
+
+ lfh %r0, 0
+
#CHECK: error: invalid operand
#CHECK: lfpc -1
#CHECK: error: invalid operand
@@ -2258,14 +2910,6 @@
llgc %r0, 524288
#CHECK: error: invalid operand
-#CHECK: llgt %r0, -524289
-#CHECK: error: invalid operand
-#CHECK: llgt %r0, 524288
-
- llgt %r0, -524289
- llgt %r0, 524288
-
-#CHECK: error: invalid operand
#CHECK: llgf %r0, -524289
#CHECK: error: invalid operand
#CHECK: llgf %r0, 524288
@@ -2310,6 +2954,14 @@
llghrl %r0, 0x100000000
#CHECK: error: invalid operand
+#CHECK: llgt %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: llgt %r0, 524288
+
+ llgt %r0, -524289
+ llgt %r0, 524288
+
+#CHECK: error: invalid operand
#CHECK: llh %r0, -524289
#CHECK: error: invalid operand
#CHECK: llh %r0, 524288
@@ -2392,6 +3044,23 @@
lm %r0, %r0, 4096
lm %r0, %r0, 0(%r1,%r2)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: lmd %r2, %r4, 160(%r1,%r15), 160(%r15)
+#CHECK: error: invalid operand
+#CHECK: lmd %r2, %r4, -1(%r1), 160(%r15)
+#CHECK: error: invalid operand
+#CHECK: lmd %r2, %r4, 4096(%r1), 160(%r15)
+#CHECK: error: invalid operand
+#CHECK: lmd %r2, %r4, 0(%r1), -1(%r15)
+#CHECK: error: invalid operand
+#CHECK: lmd %r2, %r4, 0(%r1), 4096(%r15)
+
+ lmd %r2, %r4, 160(%r1,%r15), 160(%r15)
+ lmd %r2, %r4, -1(%r1), 160(%r15)
+ lmd %r2, %r4, 4096(%r1), 160(%r15)
+ lmd %r2, %r4, 0(%r1), -1(%r15)
+ lmd %r2, %r4, 0(%r1), 4096(%r15)
+
#CHECK: error: invalid operand
#CHECK: lmg %r0, %r0, -524289
#CHECK: error: invalid operand
@@ -2544,6 +3213,17 @@
lzxr %f2
#CHECK: error: invalid operand
+#CHECK: m %r0, -1
+#CHECK: error: invalid operand
+#CHECK: m %r0, 4096
+#CHECK: error: invalid register pair
+#CHECK: m %r1, 0
+
+ m %r0, -1
+ m %r0, 4096
+ m %r1, 0
+
+#CHECK: error: invalid operand
#CHECK: madb %f0, %f0, -1
#CHECK: error: invalid operand
#CHECK: madb %f0, %f0, 4096
@@ -2560,6 +3240,23 @@
maeb %f0, %f0, 4096
#CHECK: error: invalid operand
+#CHECK: mc -1, 0
+#CHECK: error: invalid operand
+#CHECK: mc 4096, 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: mc 0(%r1,%r2), 0
+#CHECK: error: invalid operand
+#CHECK: mc 0, -1
+#CHECK: error: invalid operand
+#CHECK: mc 0, 256
+
+ mc -1, 0
+ mc 4096, 0
+ mc 0(%r1,%r2), 0
+ mc 0, -1
+ mc 0, 256
+
+#CHECK: error: invalid operand
#CHECK: mdb %f0, -1
#CHECK: error: invalid operand
#CHECK: mdb %f0, 4096
@@ -2584,6 +3281,17 @@
meeb %f0, 4096
#CHECK: error: invalid operand
+#CHECK: mfy %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: mfy %r0, 524288
+#CHECK: error: invalid register pair
+#CHECK: mfy %r1, 0
+
+ mfy %r0, -524289
+ mfy %r0, 524288
+ mfy %r1, 0
+
+#CHECK: error: invalid operand
#CHECK: mghi %r0, -32769
#CHECK: error: invalid operand
#CHECK: mghi %r0, 32768
@@ -2622,6 +3330,17 @@
mhy %r0, 524288
#CHECK: error: invalid operand
+#CHECK: ml %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: ml %r0, 524288
+#CHECK: error: invalid register pair
+#CHECK: ml %r1, 0
+
+ ml %r0, -524289
+ ml %r0, 524288
+ ml %r1, 0
+
+#CHECK: error: invalid operand
#CHECK: mlg %r0, -524289
#CHECK: error: invalid operand
#CHECK: mlg %r0, 524288
@@ -2637,6 +3356,69 @@
mlgr %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: mlr %r1, %r0
+
+ mlr %r1, %r0
+
+#CHECK: error: missing length in address
+#CHECK: mp 0, 0(1)
+#CHECK: error: missing length in address
+#CHECK: mp 0(1), 0
+#CHECK: error: missing length in address
+#CHECK: mp 0(%r1), 0(1,%r1)
+#CHECK: error: missing length in address
+#CHECK: mp 0(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mp 0(0,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: mp 0(1,%r1), 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: mp 0(17,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: mp 0(1,%r1), 0(17,%r1)
+#CHECK: error: invalid operand
+#CHECK: mp -1(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: mp 4096(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: mp 0(1,%r1), -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: mp 0(1,%r1), 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: mp 0(1,%r0), 0(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: mp 0(1,%r1), 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: mp 0(%r1,%r2), 0(1,%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: mp 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: mp 0(-), 0(1)
+
+ mp 0, 0(1)
+ mp 0(1), 0
+ mp 0(%r1), 0(1,%r1)
+ mp 0(1,%r1), 0(%r1)
+ mp 0(0,%r1), 0(1,%r1)
+ mp 0(1,%r1), 0(0,%r1)
+ mp 0(17,%r1), 0(1,%r1)
+ mp 0(1,%r1), 0(17,%r1)
+ mp -1(1,%r1), 0(1,%r1)
+ mp 4096(1,%r1), 0(1,%r1)
+ mp 0(1,%r1), -1(1,%r1)
+ mp 0(1,%r1), 4096(1,%r1)
+ mp 0(1,%r0), 0(1,%r1)
+ mp 0(1,%r1), 0(1,%r0)
+ mp 0(%r1,%r2), 0(1,%r1)
+ mp 0(1,%r2), 0(%r1,%r2)
+ mp 0(-), 0(1)
+
+#CHECK: error: invalid register pair
+#CHECK: mr %r1, %r0
+
+ mr %r1, %r0
+
#CHECK: error: invalid operand
#CHECK: ms %r0, -1
#CHECK: error: invalid operand
@@ -2745,6 +3527,50 @@
mvc 0(1,%r2), 0(%r1,%r2)
mvc 0(-), 0
+#CHECK: error: missing length in address
+#CHECK: mvcin 0, 0
+#CHECK: error: missing length in address
+#CHECK: mvcin 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: mvcin 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: mvcin 0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvcin 0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvcin -1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvcin 4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvcin 0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvcin 0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: mvcin 0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: mvcin 0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: mvcin 0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: mvcin 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: mvcin 0(-), 0
+
+ mvcin 0, 0
+ mvcin 0(%r1), 0(%r1)
+ mvcin 0(1,%r1), 0(2,%r1)
+ mvcin 0(0,%r1), 0(%r1)
+ mvcin 0(257,%r1), 0(%r1)
+ mvcin -1(1,%r1), 0(%r1)
+ mvcin 4096(1,%r1), 0(%r1)
+ mvcin 0(1,%r1), -1(%r1)
+ mvcin 0(1,%r1), 4096(%r1)
+ mvcin 0(1,%r0), 0(%r1)
+ mvcin 0(1,%r1), 0(%r0)
+ mvcin 0(%r1,%r2), 0(%r1)
+ mvcin 0(1,%r2), 0(%r1,%r2)
+ mvcin 0(-), 0
+
#CHECK: error: invalid use of length addressing
#CHECK: mvck 0(%r1,%r1), 0(2,%r1), %r3
#CHECK: error: invalid operand
@@ -2774,6 +3600,42 @@
mvck 0(%r1,%r2), 0(%r1,%r2), %r3
mvck 0(-), 0, %r3
+#CHECK: error: invalid register pair
+#CHECK: mvcl %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: mvcl %r0, %r1
+
+ mvcl %r1, %r0
+ mvcl %r0, %r1
+
+#CHECK: error: invalid register pair
+#CHECK: mvcle %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: mvcle %r0, %r1
+#CHECK: error: invalid operand
+#CHECK: mvcle %r0, %r0, -1
+#CHECK: error: invalid operand
+#CHECK: mvcle %r0, %r0, 4096
+
+ mvcle %r1, %r0, 0
+ mvcle %r0, %r1, 0
+ mvcle %r0, %r0, -1
+ mvcle %r0, %r0, 4096
+
+#CHECK: error: invalid register pair
+#CHECK: mvclu %r1, %r0
+#CHECK: error: invalid register pair
+#CHECK: mvclu %r0, %r1
+#CHECK: error: invalid operand
+#CHECK: mvclu %r0, %r0, -524289
+#CHECK: error: invalid operand
+#CHECK: mvclu %r0, %r0, 524288
+
+ mvclu %r1, %r0, 0
+ mvclu %r0, %r1, 0
+ mvclu %r0, %r0, -524289
+ mvclu %r0, %r0, 524288
+
#CHECK: error: invalid operand
#CHECK: mvghi -1, 0
#CHECK: error: invalid operand
@@ -2859,6 +3721,147 @@
mviy 0, -1
mviy 0, 256
+#CHECK: error: missing length in address
+#CHECK: mvn 0, 0
+#CHECK: error: missing length in address
+#CHECK: mvn 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: mvn 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: mvn 0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvn 0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvn -1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvn 4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvn 0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvn 0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: mvn 0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: mvn 0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: mvn 0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: mvn 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: mvn 0(-), 0
+
+ mvn 0, 0
+ mvn 0(%r1), 0(%r1)
+ mvn 0(1,%r1), 0(2,%r1)
+ mvn 0(0,%r1), 0(%r1)
+ mvn 0(257,%r1), 0(%r1)
+ mvn -1(1,%r1), 0(%r1)
+ mvn 4096(1,%r1), 0(%r1)
+ mvn 0(1,%r1), -1(%r1)
+ mvn 0(1,%r1), 4096(%r1)
+ mvn 0(1,%r0), 0(%r1)
+ mvn 0(1,%r1), 0(%r0)
+ mvn 0(%r1,%r2), 0(%r1)
+ mvn 0(1,%r2), 0(%r1,%r2)
+ mvn 0(-), 0
+
+#CHECK: error: missing length in address
+#CHECK: mvo 0, 0(1)
+#CHECK: error: missing length in address
+#CHECK: mvo 0(1), 0
+#CHECK: error: missing length in address
+#CHECK: mvo 0(%r1), 0(1,%r1)
+#CHECK: error: missing length in address
+#CHECK: mvo 0(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvo 0(0,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: mvo 0(1,%r1), 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: mvo 0(17,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: mvo 0(1,%r1), 0(17,%r1)
+#CHECK: error: invalid operand
+#CHECK: mvo -1(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: mvo 4096(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: mvo 0(1,%r1), -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: mvo 0(1,%r1), 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: mvo 0(1,%r0), 0(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: mvo 0(1,%r1), 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: mvo 0(%r1,%r2), 0(1,%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: mvo 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: mvo 0(-), 0(1)
+
+ mvo 0, 0(1)
+ mvo 0(1), 0
+ mvo 0(%r1), 0(1,%r1)
+ mvo 0(1,%r1), 0(%r1)
+ mvo 0(0,%r1), 0(1,%r1)
+ mvo 0(1,%r1), 0(0,%r1)
+ mvo 0(17,%r1), 0(1,%r1)
+ mvo 0(1,%r1), 0(17,%r1)
+ mvo -1(1,%r1), 0(1,%r1)
+ mvo 4096(1,%r1), 0(1,%r1)
+ mvo 0(1,%r1), -1(1,%r1)
+ mvo 0(1,%r1), 4096(1,%r1)
+ mvo 0(1,%r0), 0(1,%r1)
+ mvo 0(1,%r1), 0(1,%r0)
+ mvo 0(%r1,%r2), 0(1,%r1)
+ mvo 0(1,%r2), 0(%r1,%r2)
+ mvo 0(-), 0(1)
+
+#CHECK: error: missing length in address
+#CHECK: mvz 0, 0
+#CHECK: error: missing length in address
+#CHECK: mvz 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: mvz 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: mvz 0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvz 0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvz -1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvz 4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvz 0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: mvz 0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: mvz 0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: mvz 0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: mvz 0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: mvz 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: mvz 0(-), 0
+
+ mvz 0, 0
+ mvz 0(%r1), 0(%r1)
+ mvz 0(1,%r1), 0(2,%r1)
+ mvz 0(0,%r1), 0(%r1)
+ mvz 0(257,%r1), 0(%r1)
+ mvz -1(1,%r1), 0(%r1)
+ mvz 4096(1,%r1), 0(%r1)
+ mvz 0(1,%r1), -1(%r1)
+ mvz 0(1,%r1), 4096(%r1)
+ mvz 0(1,%r0), 0(%r1)
+ mvz 0(1,%r1), 0(%r0)
+ mvz 0(%r1,%r2), 0(%r1)
+ mvz 0(1,%r2), 0(%r1,%r2)
+ mvz 0(-), 0
+
#CHECK: error: invalid register pair
#CHECK: mxbr %f0, %f2
#CHECK: error: invalid register pair
@@ -3203,6 +4206,64 @@
oy %r0, -524289
oy %r0, 524288
+#CHECK: error: missing length in address
+#CHECK: pack 0, 0(1)
+#CHECK: error: missing length in address
+#CHECK: pack 0(1), 0
+#CHECK: error: missing length in address
+#CHECK: pack 0(%r1), 0(1,%r1)
+#CHECK: error: missing length in address
+#CHECK: pack 0(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: pack 0(0,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: pack 0(1,%r1), 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: pack 0(17,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: pack 0(1,%r1), 0(17,%r1)
+#CHECK: error: invalid operand
+#CHECK: pack -1(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: pack 4096(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: pack 0(1,%r1), -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: pack 0(1,%r1), 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: pack 0(1,%r0), 0(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: pack 0(1,%r1), 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: pack 0(%r1,%r2), 0(1,%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: pack 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: pack 0(-), 0(1)
+
+ pack 0, 0(1)
+ pack 0(1), 0
+ pack 0(%r1), 0(1,%r1)
+ pack 0(1,%r1), 0(%r1)
+ pack 0(0,%r1), 0(1,%r1)
+ pack 0(1,%r1), 0(0,%r1)
+ pack 0(17,%r1), 0(1,%r1)
+ pack 0(1,%r1), 0(17,%r1)
+ pack -1(1,%r1), 0(1,%r1)
+ pack 4096(1,%r1), 0(1,%r1)
+ pack 0(1,%r1), -1(1,%r1)
+ pack 0(1,%r1), 4096(1,%r1)
+ pack 0(1,%r0), 0(1,%r1)
+ pack 0(1,%r1), 0(1,%r0)
+ pack 0(%r1,%r2), 0(1,%r1)
+ pack 0(1,%r2), 0(%r1,%r2)
+ pack 0(-), 0(1)
+
+#CHECK: error: instruction requires: message-security-assist-extension4
+#CHECK: pcc
+
+ pcc
+
#CHECK: error: invalid operand
#CHECK: pfd -1, 0
#CHECK: error: invalid operand
@@ -3237,6 +4298,94 @@
pfdrl 1, 1
pfdrl 1, 0x100000000
+#CHECK: error: missing length in address
+#CHECK: pka 0, 0
+#CHECK: error: missing length in address
+#CHECK: pka 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: pka 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: pka 0(%r1), 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: pka 0(%r1), 0(257,%r1)
+#CHECK: error: invalid operand
+#CHECK: pka -1(%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: pka 4096(%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: pka 0(%r1), -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: pka 0(%r1), 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: pka 0(%r0), 0(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: pka 0(%r1), 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: pka 0(%r1,%r2), 0(1,%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: pka 0(%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: pka 0, 0(-)
+
+ pka 0, 0
+ pka 0(%r1), 0(%r1)
+ pka 0(1,%r1), 0(2,%r1)
+ pka 0(%r1), 0(0,%r1)
+ pka 0(%r1), 0(257,%r1)
+ pka -1(%r1), 0(1,%r1)
+ pka 4096(%r1), 0(1,%r1)
+ pka 0(%r1), -1(1,%r1)
+ pka 0(%r1), 4096(1,%r1)
+ pka 0(%r0), 0(1,%r1)
+ pka 0(%r1), 0(1,%r0)
+ pka 0(%r1,%r2), 0(1,%r1)
+ pka 0(%r2), 0(%r1,%r2)
+ pka 0, 0(-)
+
+#CHECK: error: missing length in address
+#CHECK: pku 0, 0
+#CHECK: error: missing length in address
+#CHECK: pku 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: pku 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: pku 0(%r1), 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: pku 0(%r1), 0(257,%r1)
+#CHECK: error: invalid operand
+#CHECK: pku -1(%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: pku 4096(%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: pku 0(%r1), -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: pku 0(%r1), 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: pku 0(%r0), 0(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: pku 0(%r1), 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: pku 0(%r1,%r2), 0(1,%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: pku 0(%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: pku 0, 0(-)
+
+ pku 0, 0
+ pku 0(%r1), 0(%r1)
+ pku 0(1,%r1), 0(2,%r1)
+ pku 0(%r1), 0(0,%r1)
+ pku 0(%r1), 0(257,%r1)
+ pku -1(%r1), 0(1,%r1)
+ pku 4096(%r1), 0(1,%r1)
+ pku 0(%r1), -1(1,%r1)
+ pku 0(%r1), 4096(1,%r1)
+ pku 0(%r0), 0(1,%r1)
+ pku 0(%r1), 0(1,%r0)
+ pku 0(%r1,%r2), 0(1,%r1)
+ pku 0(%r2), 0(%r1,%r2)
+ pku 0, 0(-)
+
#CHECK: error: invalid use of indexed addressing
#CHECK: plo %r2, 160(%r1,%r15), %r4, 160(%r15)
#CHECK: error: invalid operand
@@ -3294,6 +4443,34 @@
risblg %r1, %r2, 0, 0, 0
#CHECK: error: invalid operand
+#CHECK: rll %r0,%r0,-524289
+#CHECK: error: invalid operand
+#CHECK: rll %r0,%r0,524288
+#CHECK: error: %r0 used in an address
+#CHECK: rll %r0,%r0,0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: rll %r0,%r0,0(%r1,%r2)
+
+ rll %r0,%r0,-524289
+ rll %r0,%r0,524288
+ rll %r0,%r0,0(%r0)
+ rll %r0,%r0,0(%r1,%r2)
+
+#CHECK: error: invalid operand
+#CHECK: rllg %r0,%r0,-524289
+#CHECK: error: invalid operand
+#CHECK: rllg %r0,%r0,524288
+#CHECK: error: %r0 used in an address
+#CHECK: rllg %r0,%r0,0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: rllg %r0,%r0,0(%r1,%r2)
+
+ rllg %r0,%r0,-524289
+ rllg %r0,%r0,524288
+ rllg %r0,%r0,0(%r0)
+ rllg %r0,%r0,0(%r1,%r2)
+
+#CHECK: error: invalid operand
#CHECK: rnsbg %r0,%r0,0,0,-1
#CHECK: error: invalid operand
#CHECK: rnsbg %r0,%r0,0,0,64
@@ -3354,34 +4531,6 @@
rxsbg %r0,%r0,256,0,0
#CHECK: error: invalid operand
-#CHECK: rll %r0,%r0,-524289
-#CHECK: error: invalid operand
-#CHECK: rll %r0,%r0,524288
-#CHECK: error: %r0 used in an address
-#CHECK: rll %r0,%r0,0(%r0)
-#CHECK: error: invalid use of indexed addressing
-#CHECK: rll %r0,%r0,0(%r1,%r2)
-
- rll %r0,%r0,-524289
- rll %r0,%r0,524288
- rll %r0,%r0,0(%r0)
- rll %r0,%r0,0(%r1,%r2)
-
-#CHECK: error: invalid operand
-#CHECK: rllg %r0,%r0,-524289
-#CHECK: error: invalid operand
-#CHECK: rllg %r0,%r0,524288
-#CHECK: error: %r0 used in an address
-#CHECK: rllg %r0,%r0,0(%r0)
-#CHECK: error: invalid use of indexed addressing
-#CHECK: rllg %r0,%r0,0(%r1,%r2)
-
- rllg %r0,%r0,-524289
- rllg %r0,%r0,524288
- rllg %r0,%r0,0(%r0)
- rllg %r0,%r0,0(%r1,%r2)
-
-#CHECK: error: invalid operand
#CHECK: s %r0, -1
#CHECK: error: invalid operand
#CHECK: s %r0, 4096
@@ -3451,6 +4600,39 @@
sl %r0, 4096
#CHECK: error: invalid operand
+#CHECK: sla %r0,-1
+#CHECK: error: invalid operand
+#CHECK: sla %r0,4096
+#CHECK: error: %r0 used in an address
+#CHECK: sla %r0,0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: sla %r0,0(%r1,%r2)
+
+ sla %r0,-1
+ sla %r0,4096
+ sla %r0,0(%r0)
+ sla %r0,0(%r1,%r2)
+
+#CHECK: error: invalid operand
+#CHECK: slag %r0,%r0,-524289
+#CHECK: error: invalid operand
+#CHECK: slag %r0,%r0,524288
+#CHECK: error: %r0 used in an address
+#CHECK: slag %r0,%r0,0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: slag %r0,%r0,0(%r1,%r2)
+
+ slag %r0,%r0,-524289
+ slag %r0,%r0,524288
+ slag %r0,%r0,0(%r0)
+ slag %r0,%r0,0(%r1,%r2)
+
+#CHECK: error: instruction requires: distinct-ops
+#CHECK: slak %r2,%r3,4(%r5)
+
+ slak %r2,%r3,4(%r5)
+
+#CHECK: error: invalid operand
#CHECK: slb %r0, -524289
#CHECK: error: invalid operand
#CHECK: slb %r0, 524288
@@ -3466,6 +4648,40 @@
slbg %r0, -524289
slbg %r0, 524288
+#CHECK: error: invalid register pair
+#CHECK: slda %r1,0
+#CHECK: error: invalid operand
+#CHECK: slda %r0,-1
+#CHECK: error: invalid operand
+#CHECK: slda %r0,4096
+#CHECK: error: %r0 used in an address
+#CHECK: slda %r0,0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: slda %r0,0(%r1,%r2)
+
+ slda %r1,0
+ slda %r0,-1
+ slda %r0,4096
+ slda %r0,0(%r0)
+ slda %r0,0(%r1,%r2)
+
+#CHECK: error: invalid register pair
+#CHECK: sldl %r1,0
+#CHECK: error: invalid operand
+#CHECK: sldl %r0,-1
+#CHECK: error: invalid operand
+#CHECK: sldl %r0,4096
+#CHECK: error: %r0 used in an address
+#CHECK: sldl %r0,0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: sldl %r0,0(%r1,%r2)
+
+ sldl %r1,0
+ sldl %r0,-1
+ sldl %r0,4096
+ sldl %r0,0(%r0)
+ sldl %r0,0(%r1,%r2)
+
#CHECK: error: invalid operand
#CHECK: slfi %r0, -1
#CHECK: error: invalid operand
@@ -3504,25 +4720,6 @@
slgrk %r2,%r3,%r4
#CHECK: error: invalid operand
-#CHECK: sla %r0,-1
-#CHECK: error: invalid operand
-#CHECK: sla %r0,4096
-#CHECK: error: %r0 used in an address
-#CHECK: sla %r0,0(%r0)
-#CHECK: error: invalid use of indexed addressing
-#CHECK: sla %r0,0(%r1,%r2)
-
- sla %r0,-1
- sla %r0,4096
- sla %r0,0(%r0)
- sla %r0,0(%r1,%r2)
-
-#CHECK: error: instruction requires: distinct-ops
-#CHECK: slak %r2,%r3,4(%r5)
-
- slak %r2,%r3,4(%r5)
-
-#CHECK: error: invalid operand
#CHECK: sll %r0,-1
#CHECK: error: invalid operand
#CHECK: sll %r0,4096
@@ -3568,6 +4765,59 @@
sly %r0, -524289
sly %r0, 524288
+#CHECK: error: missing length in address
+#CHECK: sp 0, 0(1)
+#CHECK: error: missing length in address
+#CHECK: sp 0(1), 0
+#CHECK: error: missing length in address
+#CHECK: sp 0(%r1), 0(1,%r1)
+#CHECK: error: missing length in address
+#CHECK: sp 0(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: sp 0(0,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: sp 0(1,%r1), 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: sp 0(17,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: sp 0(1,%r1), 0(17,%r1)
+#CHECK: error: invalid operand
+#CHECK: sp -1(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: sp 4096(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: sp 0(1,%r1), -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: sp 0(1,%r1), 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: sp 0(1,%r0), 0(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: sp 0(1,%r1), 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: sp 0(%r1,%r2), 0(1,%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: sp 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: sp 0(-), 0(1)
+
+ sp 0, 0(1)
+ sp 0(1), 0
+ sp 0(%r1), 0(1,%r1)
+ sp 0(1,%r1), 0(%r1)
+ sp 0(0,%r1), 0(1,%r1)
+ sp 0(1,%r1), 0(0,%r1)
+ sp 0(17,%r1), 0(1,%r1)
+ sp 0(1,%r1), 0(17,%r1)
+ sp -1(1,%r1), 0(1,%r1)
+ sp 4096(1,%r1), 0(1,%r1)
+ sp 0(1,%r1), -1(1,%r1)
+ sp 0(1,%r1), 4096(1,%r1)
+ sp 0(1,%r0), 0(1,%r1)
+ sp 0(1,%r1), 0(1,%r0)
+ sp 0(%r1,%r2), 0(1,%r1)
+ sp 0(1,%r2), 0(%r1,%r2)
+ sp 0(-), 0(1)
+
#CHECK: error: invalid operand
#CHECK: sqdb %f0, -1
#CHECK: error: invalid operand
@@ -3625,6 +4875,40 @@
srak %r2,%r3,4(%r5)
+#CHECK: error: invalid register pair
+#CHECK: srda %r1,0
+#CHECK: error: invalid operand
+#CHECK: srda %r0,-1
+#CHECK: error: invalid operand
+#CHECK: srda %r0,4096
+#CHECK: error: %r0 used in an address
+#CHECK: srda %r0,0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: srda %r0,0(%r1,%r2)
+
+ srda %r1,0
+ srda %r0,-1
+ srda %r0,4096
+ srda %r0,0(%r0)
+ srda %r0,0(%r1,%r2)
+
+#CHECK: error: invalid register pair
+#CHECK: srdl %r1,0
+#CHECK: error: invalid operand
+#CHECK: srdl %r0,-1
+#CHECK: error: invalid operand
+#CHECK: srdl %r0,4096
+#CHECK: error: %r0 used in an address
+#CHECK: srdl %r0,0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: srdl %r0,0(%r1,%r2)
+
+ srdl %r1,0
+ srdl %r0,-1
+ srdl %r0,4096
+ srdl %r0,0(%r0)
+ srdl %r0,0(%r1,%r2)
+
#CHECK: error: instruction requires: distinct-ops
#CHECK: srk %r2,%r3,%r4
@@ -3690,6 +4974,56 @@
srnmt 4096
srnmt 0(%r1,%r2)
+#CHECK: error: missing length in address
+#CHECK: srp 0, 0, 0
+#CHECK: error: missing length in address
+#CHECK: srp 0(%r1), 0(%r1), 0
+#CHECK: error: invalid use of length addressing
+#CHECK: srp 0(1,%r1), 0(2,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: srp 0(0,%r1), 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: srp 0(17,%r1), 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: srp -1(1,%r1), 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: srp 4096(1,%r1), 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: srp 0(1,%r1), -1(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: srp 0(1,%r1), 4096(%r1), 0
+#CHECK: error: %r0 used in an address
+#CHECK: srp 0(1,%r0), 0(%r1), 0
+#CHECK: error: %r0 used in an address
+#CHECK: srp 0(1,%r1), 0(%r0), 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: srp 0(%r1,%r2), 0(%r1), 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: srp 0(1,%r2), 0(%r1,%r2), 0
+#CHECK: error: invalid operand
+#CHECK: srp 0(1), 0, -1
+#CHECK: error: invalid operand
+#CHECK: srp 0(1), 0, 16
+#CHECK: error: unknown token in expression
+#CHECK: srp 0(-), 0, 0
+
+ srp 0, 0, 0
+ srp 0(%r1), 0(%r1), 0
+ srp 0(1,%r1), 0(2,%r1), 0
+ srp 0(0,%r1), 0(%r1), 0
+ srp 0(17,%r1), 0(%r1), 0
+ srp -1(1,%r1), 0(%r1), 0
+ srp 4096(1,%r1), 0(%r1), 0
+ srp 0(1,%r1), -1(%r1), 0
+ srp 0(1,%r1), 4096(%r1), 0
+ srp 0(1,%r0), 0(%r1), 0
+ srp 0(1,%r1), 0(%r0), 0
+ srp 0(%r1,%r2), 0(%r1), 0
+ srp 0(1,%r2), 0(%r1,%r2), 0
+ srp 0(1), 0, -1
+ srp 0(1), 0, 16
+ srp 0(-), 0, 0
+
#CHECK: error: invalid operand
#CHECK: st %r0, -1
#CHECK: error: invalid operand
@@ -3731,6 +5065,48 @@
stch %r0, 0
#CHECK: error: invalid operand
+#CHECK: stcm %r0, 0, -1
+#CHECK: error: invalid operand
+#CHECK: stcm %r0, 0, 4096
+#CHECK: error: invalid operand
+#CHECK: stcm %r0, -1, 0
+#CHECK: error: invalid operand
+#CHECK: stcm %r0, 16, 0
+
+ stcm %r0, 0, -1
+ stcm %r0, 0, 4096
+ stcm %r0, -1, 0
+ stcm %r0, 16, 0
+
+#CHECK: error: invalid operand
+#CHECK: stcmh %r0, 0, -524289
+#CHECK: error: invalid operand
+#CHECK: stcmh %r0, 0, 524288
+#CHECK: error: invalid operand
+#CHECK: stcmh %r0, -1, 0
+#CHECK: error: invalid operand
+#CHECK: stcmh %r0, 16, 0
+
+ stcmh %r0, 0, -524289
+ stcmh %r0, 0, 524288
+ stcmh %r0, -1, 0
+ stcmh %r0, 16, 0
+
+#CHECK: error: invalid operand
+#CHECK: stcmy %r0, 0, -524289
+#CHECK: error: invalid operand
+#CHECK: stcmy %r0, 0, 524288
+#CHECK: error: invalid operand
+#CHECK: stcmy %r0, -1, 0
+#CHECK: error: invalid operand
+#CHECK: stcmy %r0, 16, 0
+
+ stcmy %r0, 0, -524289
+ stcmy %r0, 0, 524288
+ stcmy %r0, -1, 0
+ stcmy %r0, 16, 0
+
+#CHECK: error: invalid operand
#CHECK: stcy %r0, -524289
#CHECK: error: invalid operand
#CHECK: stcy %r0, 524288
@@ -3770,6 +5146,11 @@
stey %f0, -524289
stey %f0, 524288
+#CHECK: error: instruction requires: high-word
+#CHECK: stfh %r0, 0
+
+ stfh %r0, 0
+
#CHECK: error: invalid operand
#CHECK: stfpc -1
#CHECK: error: invalid operand
@@ -3838,11 +5219,6 @@
sthy %r0, -524289
sthy %r0, 524288
-#CHECK: error: instruction requires: high-word
-#CHECK: stfh %r0, 0
-
- stfh %r0, 0
-
#CHECK: error: invalid operand
#CHECK: stm %r0, %r0, 4096
#CHECK: error: invalid use of indexed addressing
@@ -4008,6 +5384,14 @@
tm 0, 256
#CHECK: error: invalid operand
+#CHECK: tmh %r0, -1
+#CHECK: error: invalid operand
+#CHECK: tmh %r0, 0x10000
+
+ tmh %r0, -1
+ tmh %r0, 0x10000
+
+#CHECK: error: invalid operand
#CHECK: tmhh %r0, -1
#CHECK: error: invalid operand
#CHECK: tmhh %r0, 0x10000
@@ -4024,12 +5408,12 @@
tmhl %r0, 0x10000
#CHECK: error: invalid operand
-#CHECK: tmh %r0, -1
+#CHECK: tml %r0, -1
#CHECK: error: invalid operand
-#CHECK: tmh %r0, 0x10000
+#CHECK: tml %r0, 0x10000
- tmh %r0, -1
- tmh %r0, 0x10000
+ tml %r0, -1
+ tml %r0, 0x10000
#CHECK: error: invalid operand
#CHECK: tmlh %r0, -1
@@ -4040,14 +5424,6 @@
tmlh %r0, 0x10000
#CHECK: error: invalid operand
-#CHECK: tml %r0, -1
-#CHECK: error: invalid operand
-#CHECK: tml %r0, 0x10000
-
- tml %r0, -1
- tml %r0, 0x10000
-
-#CHECK: error: invalid operand
#CHECK: tmll %r0, -1
#CHECK: error: invalid operand
#CHECK: tmll %r0, 0x10000
@@ -4072,6 +5448,238 @@
tmy 0, -1
tmy 0, 256
+#CHECK: error: missing length in address
+#CHECK: tp 0
+#CHECK: error: missing length in address
+#CHECK: tp 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: tp 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: tp 0(17,%r1)
+#CHECK: error: invalid operand
+#CHECK: tp -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: tp 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: tp 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: tp 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: tp 0(-)
+
+ tp 0
+ tp 0(%r1)
+ tp 0(0,%r1)
+ tp 0(17,%r1)
+ tp -1(1,%r1)
+ tp 4096(1,%r1)
+ tp 0(1,%r0)
+ tp 0(%r1,%r2)
+ tp 0(-)
+
+#CHECK: error: missing length in address
+#CHECK: tr 0, 0
+#CHECK: error: missing length in address
+#CHECK: tr 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: tr 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: tr 0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: tr 0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: tr -1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: tr 4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: tr 0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: tr 0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: tr 0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: tr 0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: tr 0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: tr 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: tr 0(-), 0
+
+ tr 0, 0
+ tr 0(%r1), 0(%r1)
+ tr 0(1,%r1), 0(2,%r1)
+ tr 0(0,%r1), 0(%r1)
+ tr 0(257,%r1), 0(%r1)
+ tr -1(1,%r1), 0(%r1)
+ tr 4096(1,%r1), 0(%r1)
+ tr 0(1,%r1), -1(%r1)
+ tr 0(1,%r1), 4096(%r1)
+ tr 0(1,%r0), 0(%r1)
+ tr 0(1,%r1), 0(%r0)
+ tr 0(%r1,%r2), 0(%r1)
+ tr 0(1,%r2), 0(%r1,%r2)
+ tr 0(-), 0
+
+#CHECK: error: invalid register pair
+#CHECK: tre %r1, %r0
+
+ tre %r1, %r0
+
+#CHECK: error: invalid register pair
+#CHECK: troo %r1, %r0
+#CHECK: error: invalid operand
+#CHECK: troo %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: troo %r2, %r4, 16
+
+ troo %r1, %r0
+ troo %r2, %r4, -1
+ troo %r2, %r4, 16
+
+#CHECK: error: invalid register pair
+#CHECK: trot %r1, %r0
+#CHECK: error: invalid operand
+#CHECK: trot %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: trot %r2, %r4, 16
+
+ trot %r1, %r0
+ trot %r2, %r4, -1
+ trot %r2, %r4, 16
+
+#CHECK: error: missing length in address
+#CHECK: trt 0, 0
+#CHECK: error: missing length in address
+#CHECK: trt 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: trt 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: trt 0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: trt 0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: trt -1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: trt 4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: trt 0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: trt 0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: trt 0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: trt 0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: trt 0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: trt 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: trt 0(-), 0
+
+ trt 0, 0
+ trt 0(%r1), 0(%r1)
+ trt 0(1,%r1), 0(2,%r1)
+ trt 0(0,%r1), 0(%r1)
+ trt 0(257,%r1), 0(%r1)
+ trt -1(1,%r1), 0(%r1)
+ trt 4096(1,%r1), 0(%r1)
+ trt 0(1,%r1), -1(%r1)
+ trt 0(1,%r1), 4096(%r1)
+ trt 0(1,%r0), 0(%r1)
+ trt 0(1,%r1), 0(%r0)
+ trt 0(%r1,%r2), 0(%r1)
+ trt 0(1,%r2), 0(%r1,%r2)
+ trt 0(-), 0
+
+#CHECK: error: invalid register pair
+#CHECK: trte %r1, %r0
+#CHECK: error: invalid operand
+#CHECK: trte %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: trte %r2, %r4, 16
+
+ trte %r1, %r0
+ trte %r2, %r4, -1
+ trte %r2, %r4, 16
+
+#CHECK: error: invalid register pair
+#CHECK: trto %r1, %r0
+#CHECK: error: invalid operand
+#CHECK: trto %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: trto %r2, %r4, 16
+
+ trto %r1, %r0
+ trto %r2, %r4, -1
+ trto %r2, %r4, 16
+
+#CHECK: error: missing length in address
+#CHECK: trtr 0, 0
+#CHECK: error: missing length in address
+#CHECK: trtr 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: trtr 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: trtr 0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: trtr 0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: trtr -1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: trtr 4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: trtr 0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: trtr 0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: trtr 0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: trtr 0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: trtr 0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: trtr 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: trtr 0(-), 0
+
+ trtr 0, 0
+ trtr 0(%r1), 0(%r1)
+ trtr 0(1,%r1), 0(2,%r1)
+ trtr 0(0,%r1), 0(%r1)
+ trtr 0(257,%r1), 0(%r1)
+ trtr -1(1,%r1), 0(%r1)
+ trtr 4096(1,%r1), 0(%r1)
+ trtr 0(1,%r1), -1(%r1)
+ trtr 0(1,%r1), 4096(%r1)
+ trtr 0(1,%r0), 0(%r1)
+ trtr 0(1,%r1), 0(%r0)
+ trtr 0(%r1,%r2), 0(%r1)
+ trtr 0(1,%r2), 0(%r1,%r2)
+ trtr 0(-), 0
+
+#CHECK: error: invalid register pair
+#CHECK: trtre %r1, %r0
+#CHECK: error: invalid operand
+#CHECK: trtre %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: trtre %r2, %r4, 16
+
+ trtre %r1, %r0
+ trtre %r2, %r4, -1
+ trtre %r2, %r4, 16
+
+#CHECK: error: invalid register pair
+#CHECK: trtt %r1, %r0
+#CHECK: error: invalid operand
+#CHECK: trtt %r2, %r4, -1
+#CHECK: error: invalid operand
+#CHECK: trtt %r2, %r4, 16
+
+ trtt %r1, %r0
+ trtt %r2, %r4, -1
+ trtt %r2, %r4, 16
+
#CHECK: error: invalid operand
#CHECK: ts -1
#CHECK: error: invalid operand
@@ -4083,6 +5691,147 @@
ts 4096
ts 0(%r1,%r2)
+#CHECK: error: missing length in address
+#CHECK: unpk 0, 0(1)
+#CHECK: error: missing length in address
+#CHECK: unpk 0(1), 0
+#CHECK: error: missing length in address
+#CHECK: unpk 0(%r1), 0(1,%r1)
+#CHECK: error: missing length in address
+#CHECK: unpk 0(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: unpk 0(0,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: unpk 0(1,%r1), 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: unpk 0(17,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: unpk 0(1,%r1), 0(17,%r1)
+#CHECK: error: invalid operand
+#CHECK: unpk -1(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: unpk 4096(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: unpk 0(1,%r1), -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: unpk 0(1,%r1), 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: unpk 0(1,%r0), 0(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: unpk 0(1,%r1), 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: unpk 0(%r1,%r2), 0(1,%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: unpk 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: unpk 0(-), 0(1)
+
+ unpk 0, 0(1)
+ unpk 0(1), 0
+ unpk 0(%r1), 0(1,%r1)
+ unpk 0(1,%r1), 0(%r1)
+ unpk 0(0,%r1), 0(1,%r1)
+ unpk 0(1,%r1), 0(0,%r1)
+ unpk 0(17,%r1), 0(1,%r1)
+ unpk 0(1,%r1), 0(17,%r1)
+ unpk -1(1,%r1), 0(1,%r1)
+ unpk 4096(1,%r1), 0(1,%r1)
+ unpk 0(1,%r1), -1(1,%r1)
+ unpk 0(1,%r1), 4096(1,%r1)
+ unpk 0(1,%r0), 0(1,%r1)
+ unpk 0(1,%r1), 0(1,%r0)
+ unpk 0(%r1,%r2), 0(1,%r1)
+ unpk 0(1,%r2), 0(%r1,%r2)
+ unpk 0(-), 0(1)
+
+#CHECK: error: missing length in address
+#CHECK: unpka 0, 0
+#CHECK: error: missing length in address
+#CHECK: unpka 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: unpka 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: unpka 0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: unpka 0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: unpka -1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: unpka 4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: unpka 0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: unpka 0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: unpka 0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: unpka 0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: unpka 0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: unpka 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: unpka 0(-), 0
+
+ unpka 0, 0
+ unpka 0(%r1), 0(%r1)
+ unpka 0(1,%r1), 0(2,%r1)
+ unpka 0(0,%r1), 0(%r1)
+ unpka 0(257,%r1), 0(%r1)
+ unpka -1(1,%r1), 0(%r1)
+ unpka 4096(1,%r1), 0(%r1)
+ unpka 0(1,%r1), -1(%r1)
+ unpka 0(1,%r1), 4096(%r1)
+ unpka 0(1,%r0), 0(%r1)
+ unpka 0(1,%r1), 0(%r0)
+ unpka 0(%r1,%r2), 0(%r1)
+ unpka 0(1,%r2), 0(%r1,%r2)
+ unpka 0(-), 0
+
+#CHECK: error: missing length in address
+#CHECK: unpku 0, 0
+#CHECK: error: missing length in address
+#CHECK: unpku 0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: unpku 0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: unpku 0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: unpku 0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: unpku -1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: unpku 4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: unpku 0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: unpku 0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: unpku 0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: unpku 0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: unpku 0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: unpku 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: unpku 0(-), 0
+
+ unpku 0, 0
+ unpku 0(%r1), 0(%r1)
+ unpku 0(1,%r1), 0(2,%r1)
+ unpku 0(0,%r1), 0(%r1)
+ unpku 0(257,%r1), 0(%r1)
+ unpku -1(1,%r1), 0(%r1)
+ unpku 4096(1,%r1), 0(%r1)
+ unpku 0(1,%r1), -1(%r1)
+ unpku 0(1,%r1), 4096(%r1)
+ unpku 0(1,%r0), 0(%r1)
+ unpku 0(1,%r1), 0(%r0)
+ unpku 0(%r1,%r2), 0(%r1)
+ unpku 0(1,%r2), 0(%r1,%r2)
+ unpku 0(-), 0
+
#CHECK: error: invalid operand
#CHECK: x %r0, -1
#CHECK: error: invalid operand
@@ -4210,3 +5959,56 @@
xy %r0, -524289
xy %r0, 524288
+
+#CHECK: error: missing length in address
+#CHECK: zap 0, 0(1)
+#CHECK: error: missing length in address
+#CHECK: zap 0(1), 0
+#CHECK: error: missing length in address
+#CHECK: zap 0(%r1), 0(1,%r1)
+#CHECK: error: missing length in address
+#CHECK: zap 0(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: zap 0(0,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: zap 0(1,%r1), 0(0,%r1)
+#CHECK: error: invalid operand
+#CHECK: zap 0(17,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: zap 0(1,%r1), 0(17,%r1)
+#CHECK: error: invalid operand
+#CHECK: zap -1(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: zap 4096(1,%r1), 0(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: zap 0(1,%r1), -1(1,%r1)
+#CHECK: error: invalid operand
+#CHECK: zap 0(1,%r1), 4096(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: zap 0(1,%r0), 0(1,%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: zap 0(1,%r1), 0(1,%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: zap 0(%r1,%r2), 0(1,%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: zap 0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: zap 0(-), 0(1)
+
+ zap 0, 0(1)
+ zap 0(1), 0
+ zap 0(%r1), 0(1,%r1)
+ zap 0(1,%r1), 0(%r1)
+ zap 0(0,%r1), 0(1,%r1)
+ zap 0(1,%r1), 0(0,%r1)
+ zap 0(17,%r1), 0(1,%r1)
+ zap 0(1,%r1), 0(17,%r1)
+ zap -1(1,%r1), 0(1,%r1)
+ zap 4096(1,%r1), 0(1,%r1)
+ zap 0(1,%r1), -1(1,%r1)
+ zap 0(1,%r1), 4096(1,%r1)
+ zap 0(1,%r0), 0(1,%r1)
+ zap 0(1,%r1), 0(1,%r0)
+ zap 0(%r1,%r2), 0(1,%r1)
+ zap 0(1,%r2), 0(%r1,%r2)
+ zap 0(-), 0(1)
diff --git a/test/MC/SystemZ/insn-good-z13.s b/test/MC/SystemZ/insn-good-z13.s
index 4fd6a664a29d..cbfcfa9a89af 100644
--- a/test/MC/SystemZ/insn-good-z13.s
+++ b/test/MC/SystemZ/insn-good-z13.s
@@ -4,16 +4,264 @@
# RUN: llvm-mc -triple s390x-linux-gnu -mcpu=arch11 -show-encoding %s \
# RUN: | FileCheck %s
-#CHECK: lzrf %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3b]
-#CHECK: lzrf %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3b]
-#CHECK: lzrf %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3b]
-#CHECK: lzrf %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x3b]
-#CHECK: lzrf %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x3b]
-#CHECK: lzrf %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x3b]
-#CHECK: lzrf %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x3b]
-#CHECK: lzrf %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x3b]
-#CHECK: lzrf %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x3b]
-#CHECK: lzrf %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x3b]
+#CHECK: lcbb %r0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x27]
+#CHECK: lcbb %r0, 0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x27]
+#CHECK: lcbb %r0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x27]
+#CHECK: lcbb %r0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x27]
+#CHECK: lcbb %r0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x27]
+#CHECK: lcbb %r15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x27]
+#CHECK: lcbb %r2, 1383(%r3,%r4), 8 # encoding: [0xe7,0x23,0x45,0x67,0x80,0x27]
+
+ lcbb %r0, 0, 0
+ lcbb %r0, 0, 15
+ lcbb %r0, 4095, 0
+ lcbb %r0, 0(%r15), 0
+ lcbb %r0, 0(%r15,%r1), 0
+ lcbb %r15, 0, 0
+ lcbb %r2, 1383(%r3,%r4), 8
+
+#CHECK: llzrgf %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3a]
+#CHECK: llzrgf %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3a]
+#CHECK: llzrgf %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3a]
+#CHECK: llzrgf %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x3a]
+#CHECK: llzrgf %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x3a]
+#CHECK: llzrgf %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x3a]
+#CHECK: llzrgf %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x3a]
+#CHECK: llzrgf %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x3a]
+#CHECK: llzrgf %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x3a]
+#CHECK: llzrgf %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x3a]
+
+ llzrgf %r0, -524288
+ llzrgf %r0, -1
+ llzrgf %r0, 0
+ llzrgf %r0, 1
+ llzrgf %r0, 524287
+ llzrgf %r0, 0(%r1)
+ llzrgf %r0, 0(%r15)
+ llzrgf %r0, 524287(%r1,%r15)
+ llzrgf %r0, 524287(%r15,%r1)
+ llzrgf %r15, 0
+
+#CHECK: lochi %r11, 42, 0 # encoding: [0xec,0xb0,0x00,0x2a,0x00,0x42]
+#CHECK: lochio %r11, 42 # encoding: [0xec,0xb1,0x00,0x2a,0x00,0x42]
+#CHECK: lochih %r11, 42 # encoding: [0xec,0xb2,0x00,0x2a,0x00,0x42]
+#CHECK: lochinle %r11, 42 # encoding: [0xec,0xb3,0x00,0x2a,0x00,0x42]
+#CHECK: lochil %r11, -1 # encoding: [0xec,0xb4,0xff,0xff,0x00,0x42]
+#CHECK: lochinhe %r11, 42 # encoding: [0xec,0xb5,0x00,0x2a,0x00,0x42]
+#CHECK: lochilh %r11, -1 # encoding: [0xec,0xb6,0xff,0xff,0x00,0x42]
+#CHECK: lochine %r11, 0 # encoding: [0xec,0xb7,0x00,0x00,0x00,0x42]
+#CHECK: lochie %r11, 0 # encoding: [0xec,0xb8,0x00,0x00,0x00,0x42]
+#CHECK: lochinlh %r11, 42 # encoding: [0xec,0xb9,0x00,0x2a,0x00,0x42]
+#CHECK: lochihe %r11, 255 # encoding: [0xec,0xba,0x00,0xff,0x00,0x42]
+#CHECK: lochinl %r11, 255 # encoding: [0xec,0xbb,0x00,0xff,0x00,0x42]
+#CHECK: lochile %r11, 32767 # encoding: [0xec,0xbc,0x7f,0xff,0x00,0x42]
+#CHECK: lochinh %r11, 32767 # encoding: [0xec,0xbd,0x7f,0xff,0x00,0x42]
+#CHECK: lochino %r11, 32512 # encoding: [0xec,0xbe,0x7f,0x00,0x00,0x42]
+#CHECK: lochi %r11, 32512, 15 # encoding: [0xec,0xbf,0x7f,0x00,0x00,0x42]
+
+ lochi %r11, 42, 0
+ lochio %r11, 42
+ lochih %r11, 42
+ lochinle %r11, 42
+ lochil %r11, -1
+ lochinhe %r11, 42
+ lochilh %r11, -1
+ lochine %r11, 0
+ lochie %r11, 0
+ lochinlh %r11, 42
+ lochihe %r11, 255
+ lochinl %r11, 255
+ lochile %r11, 32767
+ lochinh %r11, 32767
+ lochino %r11, 32512
+ lochi %r11, 32512, 15
+
+#CHECK: locghi %r11, 42, 0 # encoding: [0xec,0xb0,0x00,0x2a,0x00,0x46]
+#CHECK: locghio %r11, 42 # encoding: [0xec,0xb1,0x00,0x2a,0x00,0x46]
+#CHECK: locghih %r11, 42 # encoding: [0xec,0xb2,0x00,0x2a,0x00,0x46]
+#CHECK: locghinle %r11, 42 # encoding: [0xec,0xb3,0x00,0x2a,0x00,0x46]
+#CHECK: locghil %r11, -1 # encoding: [0xec,0xb4,0xff,0xff,0x00,0x46]
+#CHECK: locghinhe %r11, 42 # encoding: [0xec,0xb5,0x00,0x2a,0x00,0x46]
+#CHECK: locghilh %r11, -1 # encoding: [0xec,0xb6,0xff,0xff,0x00,0x46]
+#CHECK: locghine %r11, 0 # encoding: [0xec,0xb7,0x00,0x00,0x00,0x46]
+#CHECK: locghie %r11, 0 # encoding: [0xec,0xb8,0x00,0x00,0x00,0x46]
+#CHECK: locghinlh %r11, 42 # encoding: [0xec,0xb9,0x00,0x2a,0x00,0x46]
+#CHECK: locghihe %r11, 255 # encoding: [0xec,0xba,0x00,0xff,0x00,0x46]
+#CHECK: locghinl %r11, 255 # encoding: [0xec,0xbb,0x00,0xff,0x00,0x46]
+#CHECK: locghile %r11, 32767 # encoding: [0xec,0xbc,0x7f,0xff,0x00,0x46]
+#CHECK: locghinh %r11, 32767 # encoding: [0xec,0xbd,0x7f,0xff,0x00,0x46]
+#CHECK: locghino %r11, 32512 # encoding: [0xec,0xbe,0x7f,0x00,0x00,0x46]
+#CHECK: locghi %r11, 32512, 15 # encoding: [0xec,0xbf,0x7f,0x00,0x00,0x46]
+
+ locghi %r11, 42, 0
+ locghio %r11, 42
+ locghih %r11, 42
+ locghinle %r11, 42
+ locghil %r11, -1
+ locghinhe %r11, 42
+ locghilh %r11, -1
+ locghine %r11, 0
+ locghie %r11, 0
+ locghinlh %r11, 42
+ locghihe %r11, 255
+ locghinl %r11, 255
+ locghile %r11, 32767
+ locghinh %r11, 32767
+ locghino %r11, 32512
+ locghi %r11, 32512, 15
+
+#CHECK: lochhi %r11, 42, 0 # encoding: [0xec,0xb0,0x00,0x2a,0x00,0x4e]
+#CHECK: lochhio %r11, 42 # encoding: [0xec,0xb1,0x00,0x2a,0x00,0x4e]
+#CHECK: lochhih %r11, 42 # encoding: [0xec,0xb2,0x00,0x2a,0x00,0x4e]
+#CHECK: lochhinle %r11, 42 # encoding: [0xec,0xb3,0x00,0x2a,0x00,0x4e]
+#CHECK: lochhil %r11, -1 # encoding: [0xec,0xb4,0xff,0xff,0x00,0x4e]
+#CHECK: lochhinhe %r11, 42 # encoding: [0xec,0xb5,0x00,0x2a,0x00,0x4e]
+#CHECK: lochhilh %r11, -1 # encoding: [0xec,0xb6,0xff,0xff,0x00,0x4e]
+#CHECK: lochhine %r11, 0 # encoding: [0xec,0xb7,0x00,0x00,0x00,0x4e]
+#CHECK: lochhie %r11, 0 # encoding: [0xec,0xb8,0x00,0x00,0x00,0x4e]
+#CHECK: lochhinlh %r11, 42 # encoding: [0xec,0xb9,0x00,0x2a,0x00,0x4e]
+#CHECK: lochhihe %r11, 255 # encoding: [0xec,0xba,0x00,0xff,0x00,0x4e]
+#CHECK: lochhinl %r11, 255 # encoding: [0xec,0xbb,0x00,0xff,0x00,0x4e]
+#CHECK: lochhile %r11, 32767 # encoding: [0xec,0xbc,0x7f,0xff,0x00,0x4e]
+#CHECK: lochhinh %r11, 32767 # encoding: [0xec,0xbd,0x7f,0xff,0x00,0x4e]
+#CHECK: lochhino %r11, 32512 # encoding: [0xec,0xbe,0x7f,0x00,0x00,0x4e]
+#CHECK: lochhi %r11, 32512, 15 # encoding: [0xec,0xbf,0x7f,0x00,0x00,0x4e]
+
+ lochhi %r11, 42, 0
+ lochhio %r11, 42
+ lochhih %r11, 42
+ lochhinle %r11, 42
+ lochhil %r11, -1
+ lochhinhe %r11, 42
+ lochhilh %r11, -1
+ lochhine %r11, 0
+ lochhie %r11, 0
+ lochhinlh %r11, 42
+ lochhihe %r11, 255
+ lochhinl %r11, 255
+ lochhile %r11, 32767
+ lochhinh %r11, 32767
+ lochhino %r11, 32512
+ lochhi %r11, 32512, 15
+
+#CHECK: locfh %r0, 0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0xe0]
+#CHECK: locfh %r0, 0, 15 # encoding: [0xeb,0x0f,0x00,0x00,0x00,0xe0]
+#CHECK: locfh %r0, -524288, 0 # encoding: [0xeb,0x00,0x00,0x00,0x80,0xe0]
+#CHECK: locfh %r0, 524287, 0 # encoding: [0xeb,0x00,0x0f,0xff,0x7f,0xe0]
+#CHECK: locfh %r0, 0(%r1), 0 # encoding: [0xeb,0x00,0x10,0x00,0x00,0xe0]
+#CHECK: locfh %r0, 0(%r15), 0 # encoding: [0xeb,0x00,0xf0,0x00,0x00,0xe0]
+#CHECK: locfh %r15, 0, 0 # encoding: [0xeb,0xf0,0x00,0x00,0x00,0xe0]
+#CHECK: locfh %r1, 4095(%r2), 3 # encoding: [0xeb,0x13,0x2f,0xff,0x00,0xe0]
+
+ locfh %r0, 0, 0
+ locfh %r0, 0, 15
+ locfh %r0, -524288, 0
+ locfh %r0, 524287, 0
+ locfh %r0, 0(%r1), 0
+ locfh %r0, 0(%r15), 0
+ locfh %r15, 0, 0
+ locfh %r1, 4095(%r2), 3
+
+#CHECK: locfho %r1, 2(%r3) # encoding: [0xeb,0x11,0x30,0x02,0x00,0xe0]
+#CHECK: locfhh %r1, 2(%r3) # encoding: [0xeb,0x12,0x30,0x02,0x00,0xe0]
+#CHECK: locfhp %r1, 2(%r3) # encoding: [0xeb,0x12,0x30,0x02,0x00,0xe0]
+#CHECK: locfhnle %r1, 2(%r3) # encoding: [0xeb,0x13,0x30,0x02,0x00,0xe0]
+#CHECK: locfhl %r1, 2(%r3) # encoding: [0xeb,0x14,0x30,0x02,0x00,0xe0]
+#CHECK: locfhm %r1, 2(%r3) # encoding: [0xeb,0x14,0x30,0x02,0x00,0xe0]
+#CHECK: locfhnhe %r1, 2(%r3) # encoding: [0xeb,0x15,0x30,0x02,0x00,0xe0]
+#CHECK: locfhlh %r1, 2(%r3) # encoding: [0xeb,0x16,0x30,0x02,0x00,0xe0]
+#CHECK: locfhne %r1, 2(%r3) # encoding: [0xeb,0x17,0x30,0x02,0x00,0xe0]
+#CHECK: locfhnz %r1, 2(%r3) # encoding: [0xeb,0x17,0x30,0x02,0x00,0xe0]
+#CHECK: locfhe %r1, 2(%r3) # encoding: [0xeb,0x18,0x30,0x02,0x00,0xe0]
+#CHECK: locfhz %r1, 2(%r3) # encoding: [0xeb,0x18,0x30,0x02,0x00,0xe0]
+#CHECK: locfhnlh %r1, 2(%r3) # encoding: [0xeb,0x19,0x30,0x02,0x00,0xe0]
+#CHECK: locfhhe %r1, 2(%r3) # encoding: [0xeb,0x1a,0x30,0x02,0x00,0xe0]
+#CHECK: locfhnl %r1, 2(%r3) # encoding: [0xeb,0x1b,0x30,0x02,0x00,0xe0]
+#CHECK: locfhnm %r1, 2(%r3) # encoding: [0xeb,0x1b,0x30,0x02,0x00,0xe0]
+#CHECK: locfhle %r1, 2(%r3) # encoding: [0xeb,0x1c,0x30,0x02,0x00,0xe0]
+#CHECK: locfhnh %r1, 2(%r3) # encoding: [0xeb,0x1d,0x30,0x02,0x00,0xe0]
+#CHECK: locfhnp %r1, 2(%r3) # encoding: [0xeb,0x1d,0x30,0x02,0x00,0xe0]
+#CHECK: locfhno %r1, 2(%r3) # encoding: [0xeb,0x1e,0x30,0x02,0x00,0xe0]
+
+ locfho %r1, 2(%r3)
+ locfhh %r1, 2(%r3)
+ locfhp %r1, 2(%r3)
+ locfhnle %r1, 2(%r3)
+ locfhl %r1, 2(%r3)
+ locfhm %r1, 2(%r3)
+ locfhnhe %r1, 2(%r3)
+ locfhlh %r1, 2(%r3)
+ locfhne %r1, 2(%r3)
+ locfhnz %r1, 2(%r3)
+ locfhe %r1, 2(%r3)
+ locfhz %r1, 2(%r3)
+ locfhnlh %r1, 2(%r3)
+ locfhhe %r1, 2(%r3)
+ locfhnl %r1, 2(%r3)
+ locfhnm %r1, 2(%r3)
+ locfhle %r1, 2(%r3)
+ locfhnh %r1, 2(%r3)
+ locfhnp %r1, 2(%r3)
+ locfhno %r1, 2(%r3)
+
+#CHECK: locfhr %r1, %r2, 0 # encoding: [0xb9,0xe0,0x00,0x12]
+#CHECK: locfhr %r1, %r2, 15 # encoding: [0xb9,0xe0,0xf0,0x12]
+
+ locfhr %r1, %r2, 0
+ locfhr %r1, %r2, 15
+
+#CHECK: locfhro %r1, %r3 # encoding: [0xb9,0xe0,0x10,0x13]
+#CHECK: locfhrh %r1, %r3 # encoding: [0xb9,0xe0,0x20,0x13]
+#CHECK: locfhrp %r1, %r3 # encoding: [0xb9,0xe0,0x20,0x13]
+#CHECK: locfhrnle %r1, %r3 # encoding: [0xb9,0xe0,0x30,0x13]
+#CHECK: locfhrl %r1, %r3 # encoding: [0xb9,0xe0,0x40,0x13]
+#CHECK: locfhrm %r1, %r3 # encoding: [0xb9,0xe0,0x40,0x13]
+#CHECK: locfhrnhe %r1, %r3 # encoding: [0xb9,0xe0,0x50,0x13]
+#CHECK: locfhrlh %r1, %r3 # encoding: [0xb9,0xe0,0x60,0x13]
+#CHECK: locfhrne %r1, %r3 # encoding: [0xb9,0xe0,0x70,0x13]
+#CHECK: locfhrnz %r1, %r3 # encoding: [0xb9,0xe0,0x70,0x13]
+#CHECK: locfhre %r1, %r3 # encoding: [0xb9,0xe0,0x80,0x13]
+#CHECK: locfhrz %r1, %r3 # encoding: [0xb9,0xe0,0x80,0x13]
+#CHECK: locfhrnlh %r1, %r3 # encoding: [0xb9,0xe0,0x90,0x13]
+#CHECK: locfhrhe %r1, %r3 # encoding: [0xb9,0xe0,0xa0,0x13]
+#CHECK: locfhrnl %r1, %r3 # encoding: [0xb9,0xe0,0xb0,0x13]
+#CHECK: locfhrnm %r1, %r3 # encoding: [0xb9,0xe0,0xb0,0x13]
+#CHECK: locfhrle %r1, %r3 # encoding: [0xb9,0xe0,0xc0,0x13]
+#CHECK: locfhrnh %r1, %r3 # encoding: [0xb9,0xe0,0xd0,0x13]
+#CHECK: locfhrnp %r1, %r3 # encoding: [0xb9,0xe0,0xd0,0x13]
+#CHECK: locfhrno %r1, %r3 # encoding: [0xb9,0xe0,0xe0,0x13]
+
+ locfhro %r1, %r3
+ locfhrh %r1, %r3
+ locfhrp %r1, %r3
+ locfhrnle %r1, %r3
+ locfhrl %r1, %r3
+ locfhrm %r1, %r3
+ locfhrnhe %r1, %r3
+ locfhrlh %r1, %r3
+ locfhrne %r1, %r3
+ locfhrnz %r1, %r3
+ locfhre %r1, %r3
+ locfhrz %r1, %r3
+ locfhrnlh %r1, %r3
+ locfhrhe %r1, %r3
+ locfhrnl %r1, %r3
+ locfhrnm %r1, %r3
+ locfhrle %r1, %r3
+ locfhrnh %r1, %r3
+ locfhrnp %r1, %r3
+ locfhrno %r1, %r3
+
+#CHECK: lzrf %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3b]
+#CHECK: lzrf %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3b]
+#CHECK: lzrf %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3b]
+#CHECK: lzrf %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x3b]
+#CHECK: lzrf %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x3b]
+#CHECK: lzrf %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x3b]
+#CHECK: lzrf %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x3b]
+#CHECK: lzrf %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x3b]
+#CHECK: lzrf %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x3b]
+#CHECK: lzrf %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x3b]
lzrf %r0, -524288
lzrf %r0, -1
@@ -26,16 +274,16 @@
lzrf %r0, 524287(%r15,%r1)
lzrf %r15, 0
-#CHECK: lzrg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x2a]
-#CHECK: lzrg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x2a]
-#CHECK: lzrg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x2a]
-#CHECK: lzrg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x2a]
-#CHECK: lzrg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x2a]
-#CHECK: lzrg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x2a]
-#CHECK: lzrg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x2a]
-#CHECK: lzrg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x2a]
-#CHECK: lzrg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x2a]
-#CHECK: lzrg %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x2a]
+#CHECK: lzrg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x2a]
+#CHECK: lzrg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x2a]
+#CHECK: lzrg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x2a]
+#CHECK: lzrg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x2a]
+#CHECK: lzrg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x2a]
+#CHECK: lzrg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x2a]
+#CHECK: lzrg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x2a]
+#CHECK: lzrg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x2a]
+#CHECK: lzrg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x2a]
+#CHECK: lzrg %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x2a]
lzrg %r0, -524288
lzrg %r0, -1
@@ -48,43 +296,75 @@
lzrg %r0, 524287(%r15,%r1)
lzrg %r15, 0
-#CHECK: llzrgf %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3a]
-#CHECK: llzrgf %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3a]
-#CHECK: llzrgf %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3a]
-#CHECK: llzrgf %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x3a]
-#CHECK: llzrgf %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x3a]
-#CHECK: llzrgf %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x3a]
-#CHECK: llzrgf %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x3a]
-#CHECK: llzrgf %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x3a]
-#CHECK: llzrgf %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x3a]
-#CHECK: llzrgf %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x3a]
+#CHECK: stocfh %r0, 0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0xe1]
+#CHECK: stocfh %r0, 0, 15 # encoding: [0xeb,0x0f,0x00,0x00,0x00,0xe1]
+#CHECK: stocfh %r0, -524288, 0 # encoding: [0xeb,0x00,0x00,0x00,0x80,0xe1]
+#CHECK: stocfh %r0, 524287, 0 # encoding: [0xeb,0x00,0x0f,0xff,0x7f,0xe1]
+#CHECK: stocfh %r0, 0(%r1), 0 # encoding: [0xeb,0x00,0x10,0x00,0x00,0xe1]
+#CHECK: stocfh %r0, 0(%r15), 0 # encoding: [0xeb,0x00,0xf0,0x00,0x00,0xe1]
+#CHECK: stocfh %r15, 0, 0 # encoding: [0xeb,0xf0,0x00,0x00,0x00,0xe1]
+#CHECK: stocfh %r1, 4095(%r2), 3 # encoding: [0xeb,0x13,0x2f,0xff,0x00,0xe1]
+
+ stocfh %r0, 0, 0
+ stocfh %r0, 0, 15
+ stocfh %r0, -524288, 0
+ stocfh %r0, 524287, 0
+ stocfh %r0, 0(%r1), 0
+ stocfh %r0, 0(%r15), 0
+ stocfh %r15, 0, 0
+ stocfh %r1, 4095(%r2), 3
- llzrgf %r0, -524288
- llzrgf %r0, -1
- llzrgf %r0, 0
- llzrgf %r0, 1
- llzrgf %r0, 524287
- llzrgf %r0, 0(%r1)
- llzrgf %r0, 0(%r15)
- llzrgf %r0, 524287(%r1,%r15)
- llzrgf %r0, 524287(%r15,%r1)
- llzrgf %r15, 0
+#CHECK: stocfho %r1, 2(%r3) # encoding: [0xeb,0x11,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhh %r1, 2(%r3) # encoding: [0xeb,0x12,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhp %r1, 2(%r3) # encoding: [0xeb,0x12,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhnle %r1, 2(%r3) # encoding: [0xeb,0x13,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhl %r1, 2(%r3) # encoding: [0xeb,0x14,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhm %r1, 2(%r3) # encoding: [0xeb,0x14,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhnhe %r1, 2(%r3) # encoding: [0xeb,0x15,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhlh %r1, 2(%r3) # encoding: [0xeb,0x16,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhne %r1, 2(%r3) # encoding: [0xeb,0x17,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhnz %r1, 2(%r3) # encoding: [0xeb,0x17,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhe %r1, 2(%r3) # encoding: [0xeb,0x18,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhz %r1, 2(%r3) # encoding: [0xeb,0x18,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhnlh %r1, 2(%r3) # encoding: [0xeb,0x19,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhhe %r1, 2(%r3) # encoding: [0xeb,0x1a,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhnl %r1, 2(%r3) # encoding: [0xeb,0x1b,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhnm %r1, 2(%r3) # encoding: [0xeb,0x1b,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhle %r1, 2(%r3) # encoding: [0xeb,0x1c,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhnh %r1, 2(%r3) # encoding: [0xeb,0x1d,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhnp %r1, 2(%r3) # encoding: [0xeb,0x1d,0x30,0x02,0x00,0xe1]
+#CHECK: stocfhno %r1, 2(%r3) # encoding: [0xeb,0x1e,0x30,0x02,0x00,0xe1]
-#CHECK: lcbb %r0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x27]
-#CHECK: lcbb %r0, 0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x27]
-#CHECK: lcbb %r0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x27]
-#CHECK: lcbb %r0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x27]
-#CHECK: lcbb %r0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x27]
-#CHECK: lcbb %r15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x27]
-#CHECK: lcbb %r2, 1383(%r3,%r4), 8 # encoding: [0xe7,0x23,0x45,0x67,0x80,0x27]
-
- lcbb %r0, 0, 0
- lcbb %r0, 0, 15
- lcbb %r0, 4095, 0
- lcbb %r0, 0(%r15), 0
- lcbb %r0, 0(%r15,%r1), 0
- lcbb %r15, 0, 0
- lcbb %r2, 1383(%r3,%r4), 8
+ stocfho %r1, 2(%r3)
+ stocfhh %r1, 2(%r3)
+ stocfhp %r1, 2(%r3)
+ stocfhnle %r1, 2(%r3)
+ stocfhl %r1, 2(%r3)
+ stocfhm %r1, 2(%r3)
+ stocfhnhe %r1, 2(%r3)
+ stocfhlh %r1, 2(%r3)
+ stocfhne %r1, 2(%r3)
+ stocfhnz %r1, 2(%r3)
+ stocfhe %r1, 2(%r3)
+ stocfhz %r1, 2(%r3)
+ stocfhnlh %r1, 2(%r3)
+ stocfhhe %r1, 2(%r3)
+ stocfhnl %r1, 2(%r3)
+ stocfhnm %r1, 2(%r3)
+ stocfhle %r1, 2(%r3)
+ stocfhnh %r1, 2(%r3)
+ stocfhnp %r1, 2(%r3)
+ stocfhno %r1, 2(%r3)
+
+#CHECK: ppno %r2, %r2 # encoding: [0xb9,0x3c,0x00,0x22]
+#CHECK: ppno %r2, %r14 # encoding: [0xb9,0x3c,0x00,0x2e]
+#CHECK: ppno %r14, %r2 # encoding: [0xb9,0x3c,0x00,0xe2]
+#CHECK: ppno %r6, %r10 # encoding: [0xb9,0x3c,0x00,0x6a]
+
+ ppno %r2, %r2
+ ppno %r2, %r14
+ ppno %r14, %r2
+ ppno %r6, %r10
#CHECK: va %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xf3]
#CHECK: va %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xf3]
@@ -486,18 +766,6 @@
vcdlgb %v31, %v0, 0, 0
vcdlgb %v14, %v17, 4, 10
-#CHECK: vcksm %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x66]
-#CHECK: vcksm %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x66]
-#CHECK: vcksm %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x66]
-#CHECK: vcksm %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x66]
-#CHECK: vcksm %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x66]
-
- vcksm %v0, %v0, %v0
- vcksm %v0, %v0, %v31
- vcksm %v0, %v31, %v0
- vcksm %v31, %v0, %v0
- vcksm %v18, %v3, %v20
-
#CHECK: vceq %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xf8]
#CHECK: vceq %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xf8]
#CHECK: vceq %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x00,0xf8]
@@ -748,6 +1016,18 @@
vchlh %v18, %v3, %v20
vchlhs %v5, %v22, %v7
+#CHECK: vcksm %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x66]
+#CHECK: vcksm %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x66]
+#CHECK: vcksm %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x66]
+#CHECK: vcksm %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x66]
+#CHECK: vcksm %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x66]
+
+ vcksm %v0, %v0, %v0
+ vcksm %v0, %v0, %v31
+ vcksm %v0, %v31, %v0
+ vcksm %v31, %v0, %v0
+ vcksm %v18, %v3, %v20
+
#CHECK: vclgd %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xc0]
#CHECK: vclgd %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xc0]
#CHECK: vclgd %v0, %v0, 0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x00,0xc0]
@@ -1142,68 +1422,6 @@
verimh %v31, %v0, %v0, 0
verimh %v13, %v17, %v21, 0x79
-#CHECK: verllv %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x73]
-#CHECK: verllv %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x73]
-#CHECK: verllv %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x73]
-#CHECK: verllv %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x73]
-#CHECK: verllv %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x73]
-#CHECK: verllv %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0x00,0xba,0x73]
-
- verllv %v0, %v0, %v0, 0
- verllv %v0, %v0, %v0, 15
- verllv %v0, %v0, %v31, 0
- verllv %v0, %v31, %v0, 0
- verllv %v31, %v0, %v0, 0
- verllv %v18, %v3, %v20, 11
-
-#CHECK: verllvb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x73]
-#CHECK: verllvb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x73]
-#CHECK: verllvb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x73]
-#CHECK: verllvb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x73]
-#CHECK: verllvb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x73]
-
- verllvb %v0, %v0, %v0
- verllvb %v0, %v0, %v31
- verllvb %v0, %v31, %v0
- verllvb %v31, %v0, %v0
- verllvb %v18, %v3, %v20
-
-#CHECK: verllvf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x73]
-#CHECK: verllvf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x73]
-#CHECK: verllvf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x73]
-#CHECK: verllvf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x73]
-#CHECK: verllvf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x73]
-
- verllvf %v0, %v0, %v0
- verllvf %v0, %v0, %v31
- verllvf %v0, %v31, %v0
- verllvf %v31, %v0, %v0
- verllvf %v18, %v3, %v20
-
-#CHECK: verllvg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x73]
-#CHECK: verllvg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x73]
-#CHECK: verllvg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x73]
-#CHECK: verllvg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x73]
-#CHECK: verllvg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x73]
-
- verllvg %v0, %v0, %v0
- verllvg %v0, %v0, %v31
- verllvg %v0, %v31, %v0
- verllvg %v31, %v0, %v0
- verllvg %v18, %v3, %v20
-
-#CHECK: verllvh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x73]
-#CHECK: verllvh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x73]
-#CHECK: verllvh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x73]
-#CHECK: verllvh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x73]
-#CHECK: verllvh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x73]
-
- verllvh %v0, %v0, %v0
- verllvh %v0, %v0, %v31
- verllvh %v0, %v31, %v0
- verllvh %v31, %v0, %v0
- verllvh %v18, %v3, %v20
-
#CHECK: verll %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x33]
#CHECK: verll %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x33]
#CHECK: verll %v0, %v0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x33]
@@ -1276,67 +1494,67 @@
verllh %v31, %v0, 0
verllh %v14, %v17, 1074(%r5)
-#CHECK: veslv %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x70]
-#CHECK: veslv %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x70]
-#CHECK: veslv %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x70]
-#CHECK: veslv %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x70]
-#CHECK: veslv %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x70]
-#CHECK: veslv %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0x00,0xba,0x70]
+#CHECK: verllv %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x73]
+#CHECK: verllv %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x73]
+#CHECK: verllv %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x73]
+#CHECK: verllv %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x73]
+#CHECK: verllv %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x73]
+#CHECK: verllv %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0x00,0xba,0x73]
- veslv %v0, %v0, %v0, 0
- veslv %v0, %v0, %v0, 15
- veslv %v0, %v0, %v31, 0
- veslv %v0, %v31, %v0, 0
- veslv %v31, %v0, %v0, 0
- veslv %v18, %v3, %v20, 11
+ verllv %v0, %v0, %v0, 0
+ verllv %v0, %v0, %v0, 15
+ verllv %v0, %v0, %v31, 0
+ verllv %v0, %v31, %v0, 0
+ verllv %v31, %v0, %v0, 0
+ verllv %v18, %v3, %v20, 11
-#CHECK: veslvb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x70]
-#CHECK: veslvb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x70]
-#CHECK: veslvb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x70]
-#CHECK: veslvb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x70]
-#CHECK: veslvb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x70]
+#CHECK: verllvb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x73]
+#CHECK: verllvb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x73]
+#CHECK: verllvb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x73]
+#CHECK: verllvb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x73]
+#CHECK: verllvb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x73]
- veslvb %v0, %v0, %v0
- veslvb %v0, %v0, %v31
- veslvb %v0, %v31, %v0
- veslvb %v31, %v0, %v0
- veslvb %v18, %v3, %v20
+ verllvb %v0, %v0, %v0
+ verllvb %v0, %v0, %v31
+ verllvb %v0, %v31, %v0
+ verllvb %v31, %v0, %v0
+ verllvb %v18, %v3, %v20
-#CHECK: veslvf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x70]
-#CHECK: veslvf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x70]
-#CHECK: veslvf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x70]
-#CHECK: veslvf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x70]
-#CHECK: veslvf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x70]
+#CHECK: verllvf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x73]
+#CHECK: verllvf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x73]
+#CHECK: verllvf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x73]
+#CHECK: verllvf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x73]
+#CHECK: verllvf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x73]
- veslvf %v0, %v0, %v0
- veslvf %v0, %v0, %v31
- veslvf %v0, %v31, %v0
- veslvf %v31, %v0, %v0
- veslvf %v18, %v3, %v20
+ verllvf %v0, %v0, %v0
+ verllvf %v0, %v0, %v31
+ verllvf %v0, %v31, %v0
+ verllvf %v31, %v0, %v0
+ verllvf %v18, %v3, %v20
-#CHECK: veslvg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x70]
-#CHECK: veslvg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x70]
-#CHECK: veslvg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x70]
-#CHECK: veslvg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x70]
-#CHECK: veslvg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x70]
+#CHECK: verllvg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x73]
+#CHECK: verllvg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x73]
+#CHECK: verllvg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x73]
+#CHECK: verllvg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x73]
+#CHECK: verllvg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x73]
- veslvg %v0, %v0, %v0
- veslvg %v0, %v0, %v31
- veslvg %v0, %v31, %v0
- veslvg %v31, %v0, %v0
- veslvg %v18, %v3, %v20
+ verllvg %v0, %v0, %v0
+ verllvg %v0, %v0, %v31
+ verllvg %v0, %v31, %v0
+ verllvg %v31, %v0, %v0
+ verllvg %v18, %v3, %v20
-#CHECK: veslvh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x70]
-#CHECK: veslvh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x70]
-#CHECK: veslvh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x70]
-#CHECK: veslvh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x70]
-#CHECK: veslvh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x70]
+#CHECK: verllvh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x73]
+#CHECK: verllvh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x73]
+#CHECK: verllvh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x73]
+#CHECK: verllvh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x73]
+#CHECK: verllvh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x73]
- veslvh %v0, %v0, %v0
- veslvh %v0, %v0, %v31
- veslvh %v0, %v31, %v0
- veslvh %v31, %v0, %v0
- veslvh %v18, %v3, %v20
+ verllvh %v0, %v0, %v0
+ verllvh %v0, %v0, %v31
+ verllvh %v0, %v31, %v0
+ verllvh %v31, %v0, %v0
+ verllvh %v18, %v3, %v20
#CHECK: vesl %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x30]
#CHECK: vesl %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x30]
@@ -1410,67 +1628,67 @@
veslh %v31, %v0, 0
veslh %v14, %v17, 1074(%r5)
-#CHECK: vesrav %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x7a]
-#CHECK: vesrav %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x7a]
-#CHECK: vesrav %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x7a]
-#CHECK: vesrav %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x7a]
-#CHECK: vesrav %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x7a]
-#CHECK: vesrav %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0x00,0xba,0x7a]
+#CHECK: veslv %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x70]
+#CHECK: veslv %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x70]
+#CHECK: veslv %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x70]
+#CHECK: veslv %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x70]
+#CHECK: veslv %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x70]
+#CHECK: veslv %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0x00,0xba,0x70]
- vesrav %v0, %v0, %v0, 0
- vesrav %v0, %v0, %v0, 15
- vesrav %v0, %v0, %v31, 0
- vesrav %v0, %v31, %v0, 0
- vesrav %v31, %v0, %v0, 0
- vesrav %v18, %v3, %v20, 11
+ veslv %v0, %v0, %v0, 0
+ veslv %v0, %v0, %v0, 15
+ veslv %v0, %v0, %v31, 0
+ veslv %v0, %v31, %v0, 0
+ veslv %v31, %v0, %v0, 0
+ veslv %v18, %v3, %v20, 11
-#CHECK: vesravb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x7a]
-#CHECK: vesravb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x7a]
-#CHECK: vesravb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x7a]
-#CHECK: vesravb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x7a]
-#CHECK: vesravb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x7a]
+#CHECK: veslvb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x70]
+#CHECK: veslvb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x70]
+#CHECK: veslvb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x70]
+#CHECK: veslvb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x70]
+#CHECK: veslvb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x70]
- vesravb %v0, %v0, %v0
- vesravb %v0, %v0, %v31
- vesravb %v0, %v31, %v0
- vesravb %v31, %v0, %v0
- vesravb %v18, %v3, %v20
+ veslvb %v0, %v0, %v0
+ veslvb %v0, %v0, %v31
+ veslvb %v0, %v31, %v0
+ veslvb %v31, %v0, %v0
+ veslvb %v18, %v3, %v20
-#CHECK: vesravf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x7a]
-#CHECK: vesravf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x7a]
-#CHECK: vesravf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x7a]
-#CHECK: vesravf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x7a]
-#CHECK: vesravf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x7a]
+#CHECK: veslvf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x70]
+#CHECK: veslvf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x70]
+#CHECK: veslvf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x70]
+#CHECK: veslvf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x70]
+#CHECK: veslvf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x70]
- vesravf %v0, %v0, %v0
- vesravf %v0, %v0, %v31
- vesravf %v0, %v31, %v0
- vesravf %v31, %v0, %v0
- vesravf %v18, %v3, %v20
+ veslvf %v0, %v0, %v0
+ veslvf %v0, %v0, %v31
+ veslvf %v0, %v31, %v0
+ veslvf %v31, %v0, %v0
+ veslvf %v18, %v3, %v20
-#CHECK: vesravg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x7a]
-#CHECK: vesravg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x7a]
-#CHECK: vesravg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x7a]
-#CHECK: vesravg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x7a]
-#CHECK: vesravg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x7a]
+#CHECK: veslvg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x70]
+#CHECK: veslvg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x70]
+#CHECK: veslvg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x70]
+#CHECK: veslvg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x70]
+#CHECK: veslvg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x70]
- vesravg %v0, %v0, %v0
- vesravg %v0, %v0, %v31
- vesravg %v0, %v31, %v0
- vesravg %v31, %v0, %v0
- vesravg %v18, %v3, %v20
+ veslvg %v0, %v0, %v0
+ veslvg %v0, %v0, %v31
+ veslvg %v0, %v31, %v0
+ veslvg %v31, %v0, %v0
+ veslvg %v18, %v3, %v20
-#CHECK: vesravh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x7a]
-#CHECK: vesravh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x7a]
-#CHECK: vesravh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x7a]
-#CHECK: vesravh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x7a]
-#CHECK: vesravh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x7a]
+#CHECK: veslvh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x70]
+#CHECK: veslvh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x70]
+#CHECK: veslvh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x70]
+#CHECK: veslvh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x70]
+#CHECK: veslvh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x70]
- vesravh %v0, %v0, %v0
- vesravh %v0, %v0, %v31
- vesravh %v0, %v31, %v0
- vesravh %v31, %v0, %v0
- vesravh %v18, %v3, %v20
+ veslvh %v0, %v0, %v0
+ veslvh %v0, %v0, %v31
+ veslvh %v0, %v31, %v0
+ veslvh %v31, %v0, %v0
+ veslvh %v18, %v3, %v20
#CHECK: vesra %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x3a]
#CHECK: vesra %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x3a]
@@ -1544,67 +1762,67 @@
vesrah %v31, %v0, 0
vesrah %v14, %v17, 1074(%r5)
-#CHECK: vesrlv %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x78]
-#CHECK: vesrlv %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x78]
-#CHECK: vesrlv %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x78]
-#CHECK: vesrlv %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x78]
-#CHECK: vesrlv %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x78]
-#CHECK: vesrlv %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0x00,0xba,0x78]
+#CHECK: vesrav %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x7a]
+#CHECK: vesrav %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x7a]
+#CHECK: vesrav %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x7a]
+#CHECK: vesrav %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x7a]
+#CHECK: vesrav %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x7a]
+#CHECK: vesrav %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0x00,0xba,0x7a]
- vesrlv %v0, %v0, %v0, 0
- vesrlv %v0, %v0, %v0, 15
- vesrlv %v0, %v0, %v31, 0
- vesrlv %v0, %v31, %v0, 0
- vesrlv %v31, %v0, %v0, 0
- vesrlv %v18, %v3, %v20, 11
+ vesrav %v0, %v0, %v0, 0
+ vesrav %v0, %v0, %v0, 15
+ vesrav %v0, %v0, %v31, 0
+ vesrav %v0, %v31, %v0, 0
+ vesrav %v31, %v0, %v0, 0
+ vesrav %v18, %v3, %v20, 11
-#CHECK: vesrlvb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x78]
-#CHECK: vesrlvb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x78]
-#CHECK: vesrlvb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x78]
-#CHECK: vesrlvb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x78]
-#CHECK: vesrlvb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x78]
+#CHECK: vesravb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x7a]
+#CHECK: vesravb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x7a]
+#CHECK: vesravb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x7a]
+#CHECK: vesravb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x7a]
+#CHECK: vesravb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x7a]
- vesrlvb %v0, %v0, %v0
- vesrlvb %v0, %v0, %v31
- vesrlvb %v0, %v31, %v0
- vesrlvb %v31, %v0, %v0
- vesrlvb %v18, %v3, %v20
+ vesravb %v0, %v0, %v0
+ vesravb %v0, %v0, %v31
+ vesravb %v0, %v31, %v0
+ vesravb %v31, %v0, %v0
+ vesravb %v18, %v3, %v20
-#CHECK: vesrlvf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x78]
-#CHECK: vesrlvf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x78]
-#CHECK: vesrlvf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x78]
-#CHECK: vesrlvf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x78]
-#CHECK: vesrlvf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x78]
+#CHECK: vesravf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x7a]
+#CHECK: vesravf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x7a]
+#CHECK: vesravf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x7a]
+#CHECK: vesravf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x7a]
+#CHECK: vesravf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x7a]
- vesrlvf %v0, %v0, %v0
- vesrlvf %v0, %v0, %v31
- vesrlvf %v0, %v31, %v0
- vesrlvf %v31, %v0, %v0
- vesrlvf %v18, %v3, %v20
+ vesravf %v0, %v0, %v0
+ vesravf %v0, %v0, %v31
+ vesravf %v0, %v31, %v0
+ vesravf %v31, %v0, %v0
+ vesravf %v18, %v3, %v20
-#CHECK: vesrlvg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x78]
-#CHECK: vesrlvg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x78]
-#CHECK: vesrlvg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x78]
-#CHECK: vesrlvg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x78]
-#CHECK: vesrlvg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x78]
+#CHECK: vesravg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x7a]
+#CHECK: vesravg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x7a]
+#CHECK: vesravg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x7a]
+#CHECK: vesravg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x7a]
+#CHECK: vesravg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x7a]
- vesrlvg %v0, %v0, %v0
- vesrlvg %v0, %v0, %v31
- vesrlvg %v0, %v31, %v0
- vesrlvg %v31, %v0, %v0
- vesrlvg %v18, %v3, %v20
+ vesravg %v0, %v0, %v0
+ vesravg %v0, %v0, %v31
+ vesravg %v0, %v31, %v0
+ vesravg %v31, %v0, %v0
+ vesravg %v18, %v3, %v20
-#CHECK: vesrlvh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x78]
-#CHECK: vesrlvh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x78]
-#CHECK: vesrlvh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x78]
-#CHECK: vesrlvh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x78]
-#CHECK: vesrlvh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x78]
+#CHECK: vesravh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x7a]
+#CHECK: vesravh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x7a]
+#CHECK: vesravh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x7a]
+#CHECK: vesravh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x7a]
+#CHECK: vesravh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x7a]
- vesrlvh %v0, %v0, %v0
- vesrlvh %v0, %v0, %v31
- vesrlvh %v0, %v31, %v0
- vesrlvh %v31, %v0, %v0
- vesrlvh %v18, %v3, %v20
+ vesravh %v0, %v0, %v0
+ vesravh %v0, %v0, %v31
+ vesravh %v0, %v31, %v0
+ vesravh %v31, %v0, %v0
+ vesravh %v18, %v3, %v20
#CHECK: vesrl %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x38]
#CHECK: vesrl %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x38]
@@ -1678,6 +1896,68 @@
vesrlh %v31, %v0, 0
vesrlh %v14, %v17, 1074(%r5)
+#CHECK: vesrlv %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x78]
+#CHECK: vesrlv %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x78]
+#CHECK: vesrlv %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x78]
+#CHECK: vesrlv %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x78]
+#CHECK: vesrlv %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x78]
+#CHECK: vesrlv %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0x00,0xba,0x78]
+
+ vesrlv %v0, %v0, %v0, 0
+ vesrlv %v0, %v0, %v0, 15
+ vesrlv %v0, %v0, %v31, 0
+ vesrlv %v0, %v31, %v0, 0
+ vesrlv %v31, %v0, %v0, 0
+ vesrlv %v18, %v3, %v20, 11
+
+#CHECK: vesrlvb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x78]
+#CHECK: vesrlvb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x78]
+#CHECK: vesrlvb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x78]
+#CHECK: vesrlvb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x78]
+#CHECK: vesrlvb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x78]
+
+ vesrlvb %v0, %v0, %v0
+ vesrlvb %v0, %v0, %v31
+ vesrlvb %v0, %v31, %v0
+ vesrlvb %v31, %v0, %v0
+ vesrlvb %v18, %v3, %v20
+
+#CHECK: vesrlvf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x78]
+#CHECK: vesrlvf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x78]
+#CHECK: vesrlvf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x78]
+#CHECK: vesrlvf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x78]
+#CHECK: vesrlvf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x78]
+
+ vesrlvf %v0, %v0, %v0
+ vesrlvf %v0, %v0, %v31
+ vesrlvf %v0, %v31, %v0
+ vesrlvf %v31, %v0, %v0
+ vesrlvf %v18, %v3, %v20
+
+#CHECK: vesrlvg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x78]
+#CHECK: vesrlvg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x78]
+#CHECK: vesrlvg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x78]
+#CHECK: vesrlvg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x78]
+#CHECK: vesrlvg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x78]
+
+ vesrlvg %v0, %v0, %v0
+ vesrlvg %v0, %v0, %v31
+ vesrlvg %v0, %v31, %v0
+ vesrlvg %v31, %v0, %v0
+ vesrlvg %v18, %v3, %v20
+
+#CHECK: vesrlvh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x78]
+#CHECK: vesrlvh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x78]
+#CHECK: vesrlvh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x78]
+#CHECK: vesrlvh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x78]
+#CHECK: vesrlvh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x78]
+
+ vesrlvh %v0, %v0, %v0
+ vesrlvh %v0, %v0, %v31
+ vesrlvh %v0, %v31, %v0
+ vesrlvh %v31, %v0, %v0
+ vesrlvh %v18, %v3, %v20
+
#CHECK: vfa %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xe3]
#CHECK: vfa %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xe3]
#CHECK: vfa %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xe3]
@@ -2040,19 +2320,19 @@
vfeezb %v18, %v3, %v20
vfeezbs %v5, %v22, %v7
-#CFECK: vfeef %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x80]
-#CFECK: vfeef %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x80]
-#CFECK: vfeef %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x20,0x80]
-#CFECK: vfeef %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x80]
-#CFECK: vfeef %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x80]
-#CFECK: vfeef %v0, %v15, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x80]
-#CFECK: vfeef %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x80]
-#CFECK: vfeef %v15, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x80]
-#CFECK: vfeef %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x80]
-#CFECK: vfeef %v18, %v3, %v20, 0 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x80]
-#CFECK: vfeefs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x24,0x80]
-#CFECK: vfeezf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x20,0x2a,0x80]
-#CFECK: vfeezfs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x30,0x24,0x80]
+#CHECK: vfeef %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x80]
+#CHECK: vfeef %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x80]
+#CHECK: vfeef %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x20,0x80]
+#CHECK: vfeef %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x80]
+#CHECK: vfeef %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x80]
+#CHECK: vfeef %v0, %v15, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x80]
+#CHECK: vfeef %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x80]
+#CHECK: vfeef %v15, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x80]
+#CHECK: vfeef %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x80]
+#CHECK: vfeef %v18, %v3, %v20, 0 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x80]
+#CHECK: vfeefs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x24,0x80]
+#CHECK: vfeezf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x20,0x2a,0x80]
+#CHECK: vfeezfs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x30,0x24,0x80]
vfeef %v0, %v0, %v0
vfeef %v0, %v0, %v0, 0
@@ -2152,19 +2432,19 @@
vfenezb %v18, %v3, %v20
vfenezbs %v5, %v22, %v7
-#CFECK: vfenef %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x81]
-#CFECK: vfenef %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x81]
-#CFECK: vfenef %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x20,0x81]
-#CFECK: vfenef %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x81]
-#CFECK: vfenef %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x81]
-#CFECK: vfenef %v0, %v15, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x81]
-#CFECK: vfenef %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x81]
-#CFECK: vfenef %v15, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x81]
-#CFECK: vfenef %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x81]
-#CFECK: vfenef %v18, %v3, %v20, 0 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x81]
-#CFECK: vfenefs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x24,0x81]
-#CFECK: vfenezf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x20,0x2a,0x81]
-#CFECK: vfenezfs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x30,0x24,0x81]
+#CHECK: vfenef %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x81]
+#CHECK: vfenef %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x81]
+#CHECK: vfenef %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x20,0x81]
+#CHECK: vfenef %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x81]
+#CHECK: vfenef %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x81]
+#CHECK: vfenef %v0, %v15, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x81]
+#CHECK: vfenef %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x81]
+#CHECK: vfenef %v15, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x81]
+#CHECK: vfenef %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x81]
+#CHECK: vfenef %v18, %v3, %v20, 0 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x81]
+#CHECK: vfenefs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x24,0x81]
+#CHECK: vfenezf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x20,0x2a,0x81]
+#CHECK: vfenezfs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x30,0x24,0x81]
vfenef %v0, %v0, %v0
vfenef %v0, %v0, %v0, 0
@@ -2242,123 +2522,6 @@
vfidb %v31, %v0, 0, 0
vfidb %v14, %v17, 4, 10
-#CHECK: vistr %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x5c]
-#CHECK: vistr %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x5c]
-#CHECK: vistr %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x5c]
-#CHECK: vistr %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x5c]
-#CHECK: vistr %v0, %v0, 0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x00,0x5c]
-#CHECK: vistr %v0, %v15, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x5c]
-#CHECK: vistr %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x5c]
-#CHECK: vistr %v15, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x5c]
-#CHECK: vistr %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x5c]
-#CHECK: vistr %v18, %v3, 11, 9 # encoding: [0xe7,0x23,0x00,0x90,0xb8,0x5c]
-
- vistr %v0, %v0, 0
- vistr %v0, %v0, 15
- vistr %v0, %v0, 0, 0
- vistr %v0, %v0, 15, 0
- vistr %v0, %v0, 0, 12
- vistr %v0, %v15, 0
- vistr %v0, %v31, 0
- vistr %v15, %v0, 0
- vistr %v31, %v0, 0
- vistr %v18, %v3, 11, 9
-
-#CHECK: vistrb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x5c]
-#CHECK: vistrb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x5c]
-#CHECK: vistrb %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x00,0x5c]
-#CHECK: vistrb %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x5c]
-#CHECK: vistrb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x5c]
-#CHECK: vistrb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x5c]
-#CHECK: vistrb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x5c]
-#CHECK: vistrb %v18, %v3, 0 # encoding: [0xe7,0x23,0x00,0x00,0x08,0x5c]
-#CHECK: vistrbs %v5, %v22 # encoding: [0xe7,0x56,0x00,0x10,0x04,0x5c]
-
- vistrb %v0, %v0
- vistrb %v0, %v0, 0
- vistrb %v0, %v0, 12
- vistrb %v0, %v15
- vistrb %v0, %v31
- vistrb %v15, %v0
- vistrb %v31, %v0
- vistrb %v18, %v3
- vistrbs %v5, %v22
-
-#CBECK: vistrf %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x5c]
-#CBECK: vistrf %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x5c]
-#CBECK: vistrf %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x20,0x5c]
-#CBECK: vistrf %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x5c]
-#CBECK: vistrf %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x5c]
-#CBECK: vistrf %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x5c]
-#CBECK: vistrf %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x5c]
-#CBECK: vistrf %v18, %v3, 0 # encoding: [0xe7,0x23,0x00,0x00,0x28,0x5c]
-#CBECK: vistrfs %v5, %v22 # encoding: [0xe7,0x56,0x00,0x10,0x24,0x5c]
-
- vistrf %v0, %v0
- vistrf %v0, %v0, 0
- vistrf %v0, %v0, 12
- vistrf %v0, %v15
- vistrf %v0, %v31
- vistrf %v15, %v0
- vistrf %v31, %v0
- vistrf %v18, %v3
- vistrfs %v5, %v22
-
-#CHECK: vistrh %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x5c]
-#CHECK: vistrh %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x5c]
-#CHECK: vistrh %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x10,0x5c]
-#CHECK: vistrh %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x5c]
-#CHECK: vistrh %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x5c]
-#CHECK: vistrh %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x5c]
-#CHECK: vistrh %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x5c]
-#CHECK: vistrh %v18, %v3, 0 # encoding: [0xe7,0x23,0x00,0x00,0x18,0x5c]
-#CHECK: vistrhs %v5, %v22 # encoding: [0xe7,0x56,0x00,0x10,0x14,0x5c]
-
- vistrh %v0, %v0
- vistrh %v0, %v0, 0
- vistrh %v0, %v0, 12
- vistrh %v0, %v15
- vistrh %v0, %v31
- vistrh %v15, %v0
- vistrh %v31, %v0
- vistrh %v18, %v3
- vistrhs %v5, %v22
-
-#CHECK: vfpso %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xcc]
-#CHECK: vfpso %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xcc]
-#CHECK: vfpso %v0, %v0, 0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xcc]
-#CHECK: vfpso %v0, %v0, 0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x00,0xcc]
-#CHECK: vfpso %v0, %v15, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xcc]
-#CHECK: vfpso %v0, %v31, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xcc]
-#CHECK: vfpso %v15, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xcc]
-#CHECK: vfpso %v31, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xcc]
-#CHECK: vfpso %v14, %v17, 11, 9, 7 # encoding: [0xe7,0xe1,0x00,0x79,0xb4,0xcc]
-
- vfpso %v0, %v0, 0, 0, 0
- vfpso %v0, %v0, 15, 0, 0
- vfpso %v0, %v0, 0, 15, 0
- vfpso %v0, %v0, 0, 0, 15
- vfpso %v0, %v15, 0, 0, 0
- vfpso %v0, %v31, 0, 0, 0
- vfpso %v15, %v0, 0, 0, 0
- vfpso %v31, %v0, 0, 0, 0
- vfpso %v14, %v17, 11, 9, 7
-
-#CHECK: vfpsodb %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x30,0xcc]
-#CHECK: vfpsodb %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0xcc]
-#CHECK: vfpsodb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xcc]
-#CHECK: vfpsodb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0xcc]
-#CHECK: vfpsodb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xcc]
-#CHECK: vfpsodb %v14, %v17, 7 # encoding: [0xe7,0xe1,0x00,0x70,0x34,0xcc]
-
- vfpsodb %v0, %v0, 0
- vfpsodb %v0, %v0, 15
- vfpsodb %v0, %v15, 0
- vfpsodb %v0, %v31, 0
- vfpsodb %v15, %v0, 0
- vfpsodb %v31, %v0, 0
- vfpsodb %v14, %v17, 7
-
#CHECK: vflcdb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xcc]
#CHECK: vflcdb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0xcc]
#CHECK: vflcdb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xcc]
@@ -2401,6 +2564,22 @@
vflpdb %v31, %v0
vflpdb %v14, %v17
+#CHECK: vfm %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xe7]
+#CHECK: vfm %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xe7]
+#CHECK: vfm %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xe7]
+#CHECK: vfm %v0, %v0, %v31, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xe7]
+#CHECK: vfm %v0, %v31, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xe7]
+#CHECK: vfm %v31, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xe7]
+#CHECK: vfm %v18, %v3, %v20, 11, 9 # encoding: [0xe7,0x23,0x40,0x09,0xba,0xe7]
+
+ vfm %v0, %v0, %v0, 0, 0
+ vfm %v0, %v0, %v0, 15, 0
+ vfm %v0, %v0, %v0, 0, 15
+ vfm %v0, %v0, %v31, 0, 0
+ vfm %v0, %v31, %v0, 0, 0
+ vfm %v31, %v0, %v0, 0, 0
+ vfm %v18, %v3, %v20, 11, 9
+
#CHECK: vfma %v0, %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x8f]
#CHECK: vfma %v0, %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x0f,0x00,0x00,0x8f]
#CHECK: vfma %v0, %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0x8f]
@@ -2433,22 +2612,6 @@
vfmadb %v31, %v0, %v0, %v0
vfmadb %v13, %v17, %v21, %v25
-#CHECK: vfm %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xe7]
-#CHECK: vfm %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xe7]
-#CHECK: vfm %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xe7]
-#CHECK: vfm %v0, %v0, %v31, 0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xe7]
-#CHECK: vfm %v0, %v31, %v0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xe7]
-#CHECK: vfm %v31, %v0, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xe7]
-#CHECK: vfm %v18, %v3, %v20, 11, 9 # encoding: [0xe7,0x23,0x40,0x09,0xba,0xe7]
-
- vfm %v0, %v0, %v0, 0, 0
- vfm %v0, %v0, %v0, 15, 0
- vfm %v0, %v0, %v0, 0, 15
- vfm %v0, %v0, %v31, 0, 0
- vfm %v0, %v31, %v0, 0, 0
- vfm %v31, %v0, %v0, 0, 0
- vfm %v18, %v3, %v20, 11, 9
-
#CHECK: vfmdb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xe7]
#CHECK: vfmdb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xe7]
#CHECK: vfmdb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xe7]
@@ -2493,6 +2656,41 @@
vfmsdb %v31, %v0, %v0, %v0
vfmsdb %v13, %v17, %v21, %v25
+#CHECK: vfpso %v0, %v0, 0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xcc]
+#CHECK: vfpso %v0, %v0, 15, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xcc]
+#CHECK: vfpso %v0, %v0, 0, 15, 0 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xcc]
+#CHECK: vfpso %v0, %v0, 0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x00,0xcc]
+#CHECK: vfpso %v0, %v15, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xcc]
+#CHECK: vfpso %v0, %v31, 0, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xcc]
+#CHECK: vfpso %v15, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xcc]
+#CHECK: vfpso %v31, %v0, 0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xcc]
+#CHECK: vfpso %v14, %v17, 11, 9, 7 # encoding: [0xe7,0xe1,0x00,0x79,0xb4,0xcc]
+
+ vfpso %v0, %v0, 0, 0, 0
+ vfpso %v0, %v0, 15, 0, 0
+ vfpso %v0, %v0, 0, 15, 0
+ vfpso %v0, %v0, 0, 0, 15
+ vfpso %v0, %v15, 0, 0, 0
+ vfpso %v0, %v31, 0, 0, 0
+ vfpso %v15, %v0, 0, 0, 0
+ vfpso %v31, %v0, 0, 0, 0
+ vfpso %v14, %v17, 11, 9, 7
+
+#CHECK: vfpsodb %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x30,0xcc]
+#CHECK: vfpsodb %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0xcc]
+#CHECK: vfpsodb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xcc]
+#CHECK: vfpsodb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0xcc]
+#CHECK: vfpsodb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xcc]
+#CHECK: vfpsodb %v14, %v17, 7 # encoding: [0xe7,0xe1,0x00,0x70,0x34,0xcc]
+
+ vfpsodb %v0, %v0, 0
+ vfpsodb %v0, %v0, 15
+ vfpsodb %v0, %v15, 0
+ vfpsodb %v0, %v31, 0
+ vfpsodb %v15, %v0, 0
+ vfpsodb %v31, %v0, 0
+ vfpsodb %v14, %v17, 7
+
#CHECK: vfs %v0, %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xe2]
#CHECK: vfs %v0, %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xe2]
#CHECK: vfs %v0, %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x0f,0x00,0xe2]
@@ -2645,6 +2843,20 @@
vgeg %v31, 0(%v0,%r1), 0
vgeg %v10, 1000(%v19,%r7), 1
+#CHECK: vgfm %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xb4]
+#CHECK: vgfm %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xb4]
+#CHECK: vgfm %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xb4]
+#CHECK: vgfm %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xb4]
+#CHECK: vgfm %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xb4]
+#CHECK: vgfm %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0x00,0xba,0xb4]
+
+ vgfm %v0, %v0, %v0, 0
+ vgfm %v0, %v0, %v0, 15
+ vgfm %v0, %v0, %v31, 0
+ vgfm %v0, %v31, %v0, 0
+ vgfm %v31, %v0, %v0, 0
+ vgfm %v18, %v3, %v20, 11
+
#CHECK: vgfma %v0, %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xbc]
#CHECK: vgfma %v0, %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x0f,0x00,0x00,0xbc]
#CHECK: vgfma %v0, %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xbc]
@@ -2717,20 +2929,6 @@
vgfmah %v31, %v0, %v0, %v0
vgfmah %v13, %v17, %v21, %v25
-#CHECK: vgfm %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xb4]
-#CHECK: vgfm %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xb4]
-#CHECK: vgfm %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xb4]
-#CHECK: vgfm %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xb4]
-#CHECK: vgfm %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xb4]
-#CHECK: vgfm %v18, %v3, %v20, 11 # encoding: [0xe7,0x23,0x40,0x00,0xba,0xb4]
-
- vgfm %v0, %v0, %v0, 0
- vgfm %v0, %v0, %v0, 15
- vgfm %v0, %v0, %v31, 0
- vgfm %v0, %v31, %v0, 0
- vgfm %v31, %v0, %v0, 0
- vgfm %v18, %v3, %v20, 11
-
#CHECK: vgfmb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xb4]
#CHECK: vgfmb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xb4]
#CHECK: vgfmb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xb4]
@@ -2851,6 +3049,88 @@
vgmh %v31, 0, 0
vgmh %v21, 2, 3
+#CHECK: vistr %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x5c]
+#CHECK: vistr %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x5c]
+#CHECK: vistr %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x5c]
+#CHECK: vistr %v0, %v0, 15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x5c]
+#CHECK: vistr %v0, %v0, 0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x00,0x5c]
+#CHECK: vistr %v0, %v15, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x5c]
+#CHECK: vistr %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x5c]
+#CHECK: vistr %v15, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x5c]
+#CHECK: vistr %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x5c]
+#CHECK: vistr %v18, %v3, 11, 9 # encoding: [0xe7,0x23,0x00,0x90,0xb8,0x5c]
+
+ vistr %v0, %v0, 0
+ vistr %v0, %v0, 15
+ vistr %v0, %v0, 0, 0
+ vistr %v0, %v0, 15, 0
+ vistr %v0, %v0, 0, 12
+ vistr %v0, %v15, 0
+ vistr %v0, %v31, 0
+ vistr %v15, %v0, 0
+ vistr %v31, %v0, 0
+ vistr %v18, %v3, 11, 9
+
+#CHECK: vistrb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x5c]
+#CHECK: vistrb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x5c]
+#CHECK: vistrb %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x00,0x5c]
+#CHECK: vistrb %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x5c]
+#CHECK: vistrb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x5c]
+#CHECK: vistrb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x5c]
+#CHECK: vistrb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x5c]
+#CHECK: vistrb %v18, %v3, 0 # encoding: [0xe7,0x23,0x00,0x00,0x08,0x5c]
+#CHECK: vistrbs %v5, %v22 # encoding: [0xe7,0x56,0x00,0x10,0x04,0x5c]
+
+ vistrb %v0, %v0
+ vistrb %v0, %v0, 0
+ vistrb %v0, %v0, 12
+ vistrb %v0, %v15
+ vistrb %v0, %v31
+ vistrb %v15, %v0
+ vistrb %v31, %v0
+ vistrb %v18, %v3
+ vistrbs %v5, %v22
+
+#CHECK: vistrf %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x5c]
+#CHECK: vistrf %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x5c]
+#CHECK: vistrf %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x20,0x5c]
+#CHECK: vistrf %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x5c]
+#CHECK: vistrf %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x5c]
+#CHECK: vistrf %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x5c]
+#CHECK: vistrf %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x5c]
+#CHECK: vistrf %v18, %v3, 0 # encoding: [0xe7,0x23,0x00,0x00,0x28,0x5c]
+#CHECK: vistrfs %v5, %v22 # encoding: [0xe7,0x56,0x00,0x10,0x24,0x5c]
+
+ vistrf %v0, %v0
+ vistrf %v0, %v0, 0
+ vistrf %v0, %v0, 12
+ vistrf %v0, %v15
+ vistrf %v0, %v31
+ vistrf %v15, %v0
+ vistrf %v31, %v0
+ vistrf %v18, %v3
+ vistrfs %v5, %v22
+
+#CHECK: vistrh %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x5c]
+#CHECK: vistrh %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x5c]
+#CHECK: vistrh %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x10,0x5c]
+#CHECK: vistrh %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x5c]
+#CHECK: vistrh %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x5c]
+#CHECK: vistrh %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x5c]
+#CHECK: vistrh %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x5c]
+#CHECK: vistrh %v18, %v3, 0 # encoding: [0xe7,0x23,0x00,0x00,0x18,0x5c]
+#CHECK: vistrhs %v5, %v22 # encoding: [0xe7,0x56,0x00,0x10,0x14,0x5c]
+
+ vistrh %v0, %v0
+ vistrh %v0, %v0, 0
+ vistrh %v0, %v0, 12
+ vistrh %v0, %v15
+ vistrh %v0, %v31
+ vistrh %v15, %v0
+ vistrh %v31, %v0
+ vistrh %v18, %v3
+ vistrhs %v5, %v22
+
#CHECK: vl %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x06]
#CHECK: vl %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x06]
#CHECK: vl %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x06]
@@ -5511,20 +5791,6 @@
vsceg %v31, 0(%v0,%r1), 0
vsceg %v10, 1000(%v19,%r7), 1
-#CHECK: vsel %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x8d]
-#CHECK: vsel %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0x8d]
-#CHECK: vsel %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x8d]
-#CHECK: vsel %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x8d]
-#CHECK: vsel %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x8d]
-#CHECK: vsel %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0x8d]
-
- vsel %v0, %v0, %v0, %v0
- vsel %v0, %v0, %v0, %v31
- vsel %v0, %v0, %v31, %v0
- vsel %v0, %v31, %v0, %v0
- vsel %v31, %v0, %v0, %v0
- vsel %v13, %v17, %v21, %v25
-
#CHECK: vseg %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x5f]
#CHECK: vseg %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x5f]
#CHECK: vseg %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x5f]
@@ -5583,6 +5849,20 @@
vsegh %v31, %v0
vsegh %v14, %v17
+#CHECK: vsel %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x8d]
+#CHECK: vsel %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0x8d]
+#CHECK: vsel %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x8d]
+#CHECK: vsel %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x8d]
+#CHECK: vsel %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x8d]
+#CHECK: vsel %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0x8d]
+
+ vsel %v0, %v0, %v0, %v0
+ vsel %v0, %v0, %v0, %v31
+ vsel %v0, %v0, %v31, %v0
+ vsel %v0, %v31, %v0, %v0
+ vsel %v31, %v0, %v0, %v0
+ vsel %v13, %v17, %v21, %v25
+
#CHECK: vsf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xf7]
#CHECK: vsf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xf7]
#CHECK: vsf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xf7]
@@ -5989,6 +6269,18 @@
vsum %v31, %v0, %v0, 0
vsum %v18, %v3, %v20, 11
+#CHECK: vsumb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x64]
+#CHECK: vsumb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x64]
+#CHECK: vsumb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x64]
+#CHECK: vsumb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x64]
+#CHECK: vsumb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x64]
+
+ vsumb %v0, %v0, %v0
+ vsumb %v0, %v0, %v31
+ vsumb %v0, %v31, %v0
+ vsumb %v31, %v0, %v0
+ vsumb %v18, %v3, %v20
+
#CHECK: vsumg %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x65]
#CHECK: vsumg %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x65]
#CHECK: vsumg %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x65]
@@ -6003,6 +6295,18 @@
vsumg %v31, %v0, %v0, 0
vsumg %v18, %v3, %v20, 11
+#CHECK: vsumgf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x65]
+#CHECK: vsumgf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x65]
+#CHECK: vsumgf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x65]
+#CHECK: vsumgf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x65]
+#CHECK: vsumgf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x65]
+
+ vsumgf %v0, %v0, %v0
+ vsumgf %v0, %v0, %v31
+ vsumgf %v0, %v31, %v0
+ vsumgf %v31, %v0, %v0
+ vsumgf %v18, %v3, %v20
+
#CHECK: vsumgh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x65]
#CHECK: vsumgh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x65]
#CHECK: vsumgh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x65]
@@ -6015,17 +6319,17 @@
vsumgh %v31, %v0, %v0
vsumgh %v18, %v3, %v20
-#CHECK: vsumgf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x65]
-#CHECK: vsumgf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x65]
-#CHECK: vsumgf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x65]
-#CHECK: vsumgf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x65]
-#CHECK: vsumgf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x65]
+#CHECK: vsumh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x64]
+#CHECK: vsumh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x64]
+#CHECK: vsumh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x64]
+#CHECK: vsumh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x64]
+#CHECK: vsumh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x64]
- vsumgf %v0, %v0, %v0
- vsumgf %v0, %v0, %v31
- vsumgf %v0, %v31, %v0
- vsumgf %v31, %v0, %v0
- vsumgf %v18, %v3, %v20
+ vsumh %v0, %v0, %v0
+ vsumh %v0, %v0, %v31
+ vsumh %v0, %v31, %v0
+ vsumh %v31, %v0, %v0
+ vsumh %v18, %v3, %v20
#CHECK: vsumq %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x67]
#CHECK: vsumq %v0, %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x67]
@@ -6065,30 +6369,6 @@
vsumqg %v31, %v0, %v0
vsumqg %v18, %v3, %v20
-#CHECK: vsumb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x64]
-#CHECK: vsumb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x64]
-#CHECK: vsumb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x64]
-#CHECK: vsumb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x64]
-#CHECK: vsumb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x64]
-
- vsumb %v0, %v0, %v0
- vsumb %v0, %v0, %v31
- vsumb %v0, %v31, %v0
- vsumb %v31, %v0, %v0
- vsumb %v18, %v3, %v20
-
-#CHECK: vsumh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x64]
-#CHECK: vsumh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x64]
-#CHECK: vsumh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x64]
-#CHECK: vsumh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x64]
-#CHECK: vsumh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x64]
-
- vsumh %v0, %v0, %v0
- vsumh %v0, %v0, %v31
- vsumh %v0, %v31, %v0
- vsumh %v31, %v0, %v0
- vsumh %v18, %v3, %v20
-
#CHECK: vtm %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd8]
#CHECK: vtm %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd8]
#CHECK: vtm %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xd8]
@@ -6161,6 +6441,50 @@
vuphh %v31, %v0
vuphh %v14, %v17
+#CHECK: vupl %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd6]
+#CHECK: vupl %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xd6]
+#CHECK: vupl %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd6]
+#CHECK: vupl %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xd6]
+#CHECK: vupl %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xd6]
+#CHECK: vupl %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xd6]
+#CHECK: vupl %v14, %v17, 11 # encoding: [0xe7,0xe1,0x00,0x00,0xb4,0xd6]
+
+ vupl %v0, %v0, 0
+ vupl %v0, %v0, 15
+ vupl %v0, %v15, 0
+ vupl %v0, %v31, 0
+ vupl %v15, %v0, 0
+ vupl %v31, %v0, 0
+ vupl %v14, %v17, 11
+
+#CHECK: vuplb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd6]
+#CHECK: vuplb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd6]
+#CHECK: vuplb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xd6]
+#CHECK: vuplb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xd6]
+#CHECK: vuplb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xd6]
+#CHECK: vuplb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0xd6]
+
+ vuplb %v0, %v0
+ vuplb %v0, %v15
+ vuplb %v0, %v31
+ vuplb %v15, %v0
+ vuplb %v31, %v0
+ vuplb %v14, %v17
+
+#CHECK: vuplf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xd6]
+#CHECK: vuplf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xd6]
+#CHECK: vuplf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xd6]
+#CHECK: vuplf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xd6]
+#CHECK: vuplf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xd6]
+#CHECK: vuplf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xd6]
+
+ vuplf %v0, %v0
+ vuplf %v0, %v15
+ vuplf %v0, %v31
+ vuplf %v15, %v0
+ vuplf %v31, %v0
+ vuplf %v14, %v17
+
#CHECK: vuplh %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd5]
#CHECK: vuplh %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xd5]
#CHECK: vuplh %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd5]
@@ -6219,50 +6543,6 @@
vuplhh %v31, %v0
vuplhh %v14, %v17
-#CHECK: vupl %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd6]
-#CHECK: vupl %v0, %v0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0xd6]
-#CHECK: vupl %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd6]
-#CHECK: vupl %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xd6]
-#CHECK: vupl %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xd6]
-#CHECK: vupl %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xd6]
-#CHECK: vupl %v14, %v17, 11 # encoding: [0xe7,0xe1,0x00,0x00,0xb4,0xd6]
-
- vupl %v0, %v0, 0
- vupl %v0, %v0, 15
- vupl %v0, %v15, 0
- vupl %v0, %v31, 0
- vupl %v15, %v0, 0
- vupl %v31, %v0, 0
- vupl %v14, %v17, 11
-
-#CHECK: vuplb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd6]
-#CHECK: vuplb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd6]
-#CHECK: vuplb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xd6]
-#CHECK: vuplb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xd6]
-#CHECK: vuplb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xd6]
-#CHECK: vuplb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0xd6]
-
- vuplb %v0, %v0
- vuplb %v0, %v15
- vuplb %v0, %v31
- vuplb %v15, %v0
- vuplb %v31, %v0
- vuplb %v14, %v17
-
-#CHECK: vuplf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xd6]
-#CHECK: vuplf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xd6]
-#CHECK: vuplf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xd6]
-#CHECK: vuplf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xd6]
-#CHECK: vuplf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xd6]
-#CHECK: vuplf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xd6]
-
- vuplf %v0, %v0
- vuplf %v0, %v15
- vuplf %v0, %v31
- vuplf %v15, %v0
- vuplf %v31, %v0
- vuplf %v14, %v17
-
#CHECK: vuplhw %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xd6]
#CHECK: vuplhw %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0xd6]
#CHECK: vuplhw %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xd6]
@@ -6358,7 +6638,7 @@
vzero %v31
#CHECK: wcdgb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc3]
-#CHECK: wcdgb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc3]
+#CHECK: wcdgb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc3]
#CHECK: wcdgb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc3]
#CHECK: wcdgb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc3]
#CHECK: wcdgb %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc3]
@@ -6376,7 +6656,7 @@
wcdgb %v14, %v17, 4, 10
#CHECK: wcdlgb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc1]
-#CHECK: wcdlgb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc1]
+#CHECK: wcdlgb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc1]
#CHECK: wcdlgb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc1]
#CHECK: wcdlgb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc1]
#CHECK: wcdlgb %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc1]
@@ -6394,7 +6674,7 @@
wcdlgb %v14, %v17, 4, 10
#CHECK: wcgdb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc2]
-#CHECK: wcgdb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc2]
+#CHECK: wcgdb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc2]
#CHECK: wcgdb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc2]
#CHECK: wcgdb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc2]
#CHECK: wcgdb %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc2]
@@ -6412,7 +6692,7 @@
wcgdb %v14, %v17, 4, 10
#CHECK: wclgdb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc0]
-#CHECK: wclgdb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc0]
+#CHECK: wclgdb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc0]
#CHECK: wclgdb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc0]
#CHECK: wclgdb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc0]
#CHECK: wclgdb %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc0]
@@ -6470,7 +6750,7 @@
#CHECK: wfcdb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0xcb]
#CHECK: wfcdb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xcb]
#CHECK: wfcdb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0xcb]
-
+
wfcdb %v0, %v0
wfcdb %f0, %f0
wfcdb %v0, %v15
@@ -6527,7 +6807,7 @@
#CHECK: wfchdbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x34,0xeb]
#CHECK: wfchdbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x38,0xeb]
#CHECK: wfchdbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x3a,0xeb]
-
+
wfchdbs %v0, %v0, %v0
wfchdbs %f0, %f0, %f0
wfchdbs %v0, %v0, %v31
@@ -6578,7 +6858,7 @@
wfddb %v18, %v3, %v20
#CHECK: wfidb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc7]
-#CHECK: wfidb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc7]
+#CHECK: wfidb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc7]
#CHECK: wfidb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc7]
#CHECK: wfidb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc7]
#CHECK: wfidb %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc7]
@@ -6631,24 +6911,6 @@
wfkdb %v31, %v0
wfkdb %v14, %v17
-#CHECK: wfpsodb %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xcc]
-#CHECK: wfpsodb %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xcc]
-#CHECK: wfpsodb %f0, %f0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xcc]
-#CHECK: wfpsodb %f0, %f15, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x30,0xcc]
-#CHECK: wfpsodb %f0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xcc]
-#CHECK: wfpsodb %f15, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x30,0xcc]
-#CHECK: wfpsodb %v31, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xcc]
-#CHECK: wfpsodb %f14, %v17, 7 # encoding: [0xe7,0xe1,0x00,0x78,0x34,0xcc]
-
- wfpsodb %v0, %v0, 0
- wfpsodb %f0, %f0, 0
- wfpsodb %v0, %v0, 15
- wfpsodb %v0, %v15, 0
- wfpsodb %v0, %v31, 0
- wfpsodb %v15, %v0, 0
- wfpsodb %v31, %v0, 0
- wfpsodb %v14, %v17, 7
-
#CHECK: wflcdb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xcc]
#CHECK: wflcdb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xcc]
#CHECK: wflcdb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x30,0xcc]
@@ -6743,20 +7005,38 @@
wfmsdb %v31, %v0, %v0, %v0
wfmsdb %v13, %v17, %v21, %v25
+#CHECK: wfpsodb %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xcc]
+#CHECK: wfpsodb %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xcc]
+#CHECK: wfpsodb %f0, %f0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xcc]
+#CHECK: wfpsodb %f0, %f15, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x30,0xcc]
+#CHECK: wfpsodb %f0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xcc]
+#CHECK: wfpsodb %f15, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x30,0xcc]
+#CHECK: wfpsodb %v31, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xcc]
+#CHECK: wfpsodb %f14, %v17, 7 # encoding: [0xe7,0xe1,0x00,0x78,0x34,0xcc]
+
+ wfpsodb %v0, %v0, 0
+ wfpsodb %f0, %f0, 0
+ wfpsodb %v0, %v0, 15
+ wfpsodb %v0, %v15, 0
+ wfpsodb %v0, %v31, 0
+ wfpsodb %v15, %v0, 0
+ wfpsodb %v31, %v0, 0
+ wfpsodb %v14, %v17, 7
+
#CHECK: wfsdb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xe2]
#CHECK: wfsdb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xe2]
#CHECK: wfsdb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xe2]
#CHECK: wfsdb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xe2]
#CHECK: wfsdb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xe2]
#CHECK: wfsdb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x3a,0xe2]
-
+
wfsdb %v0, %v0, %v0
wfsdb %f0, %f0, %f0
wfsdb %v0, %v0, %v31
wfsdb %v0, %v31, %v0
wfsdb %v31, %v0, %v0
wfsdb %v18, %v3, %v20
-
+
#CHECK: wfsqdb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xce]
#CHECK: wfsqdb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xce]
#CHECK: wfsqdb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x30,0xce]
@@ -6764,7 +7044,7 @@
#CHECK: wfsqdb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x30,0xce]
#CHECK: wfsqdb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xce]
#CHECK: wfsqdb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x34,0xce]
-
+
wfsqdb %v0, %v0
wfsqdb %f0, %f0
wfsqdb %v0, %v15
@@ -6781,7 +7061,7 @@
#CHECK: wftcidb %f15, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x30,0x4a]
#CHECK: wftcidb %v31, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0x4a]
#CHECK: wftcidb %f4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x88,0x34,0x4a]
-
+
wftcidb %v0, %v0, 0
wftcidb %f0, %f0, 0
wftcidb %v0, %v0, 4095
@@ -6818,280 +7098,10 @@
wledb %v0, %v0, 0, 0
wledb %f0, %f0, 0, 0
- wledb %v0, %v0, 0, 15
+ wledb %v0, %v0, 0, 15
wledb %v0, %v0, 4, 0
wledb %v0, %v0, 12, 0
wledb %v0, %v31, 0, 0
wledb %v31, %v0, 0, 0
wledb %v14, %v17, 4, 10
-#CHECK: lochi %r11, 42, 0 # encoding: [0xec,0xb0,0x00,0x2a,0x00,0x42]
-#CHECK: lochio %r11, 42 # encoding: [0xec,0xb1,0x00,0x2a,0x00,0x42]
-#CHECK: lochih %r11, 42 # encoding: [0xec,0xb2,0x00,0x2a,0x00,0x42]
-#CHECK: lochinle %r11, 42 # encoding: [0xec,0xb3,0x00,0x2a,0x00,0x42]
-#CHECK: lochil %r11, -1 # encoding: [0xec,0xb4,0xff,0xff,0x00,0x42]
-#CHECK: lochinhe %r11, 42 # encoding: [0xec,0xb5,0x00,0x2a,0x00,0x42]
-#CHECK: lochilh %r11, -1 # encoding: [0xec,0xb6,0xff,0xff,0x00,0x42]
-#CHECK: lochine %r11, 0 # encoding: [0xec,0xb7,0x00,0x00,0x00,0x42]
-#CHECK: lochie %r11, 0 # encoding: [0xec,0xb8,0x00,0x00,0x00,0x42]
-#CHECK: lochinlh %r11, 42 # encoding: [0xec,0xb9,0x00,0x2a,0x00,0x42]
-#CHECK: lochihe %r11, 255 # encoding: [0xec,0xba,0x00,0xff,0x00,0x42]
-#CHECK: lochinl %r11, 255 # encoding: [0xec,0xbb,0x00,0xff,0x00,0x42]
-#CHECK: lochile %r11, 32767 # encoding: [0xec,0xbc,0x7f,0xff,0x00,0x42]
-#CHECK: lochinh %r11, 32767 # encoding: [0xec,0xbd,0x7f,0xff,0x00,0x42]
-#CHECK: lochino %r11, 32512 # encoding: [0xec,0xbe,0x7f,0x00,0x00,0x42]
-#CHECK: lochi %r11, 32512, 15 # encoding: [0xec,0xbf,0x7f,0x00,0x00,0x42]
-
- lochi %r11, 42, 0
- lochio %r11, 42
- lochih %r11, 42
- lochinle %r11, 42
- lochil %r11, -1
- lochinhe %r11, 42
- lochilh %r11, -1
- lochine %r11, 0
- lochie %r11, 0
- lochinlh %r11, 42
- lochihe %r11, 255
- lochinl %r11, 255
- lochile %r11, 32767
- lochinh %r11, 32767
- lochino %r11, 32512
- lochi %r11, 32512, 15
-
-#CHECK: locghi %r11, 42, 0 # encoding: [0xec,0xb0,0x00,0x2a,0x00,0x46]
-#CHECK: locghio %r11, 42 # encoding: [0xec,0xb1,0x00,0x2a,0x00,0x46]
-#CHECK: locghih %r11, 42 # encoding: [0xec,0xb2,0x00,0x2a,0x00,0x46]
-#CHECK: locghinle %r11, 42 # encoding: [0xec,0xb3,0x00,0x2a,0x00,0x46]
-#CHECK: locghil %r11, -1 # encoding: [0xec,0xb4,0xff,0xff,0x00,0x46]
-#CHECK: locghinhe %r11, 42 # encoding: [0xec,0xb5,0x00,0x2a,0x00,0x46]
-#CHECK: locghilh %r11, -1 # encoding: [0xec,0xb6,0xff,0xff,0x00,0x46]
-#CHECK: locghine %r11, 0 # encoding: [0xec,0xb7,0x00,0x00,0x00,0x46]
-#CHECK: locghie %r11, 0 # encoding: [0xec,0xb8,0x00,0x00,0x00,0x46]
-#CHECK: locghinlh %r11, 42 # encoding: [0xec,0xb9,0x00,0x2a,0x00,0x46]
-#CHECK: locghihe %r11, 255 # encoding: [0xec,0xba,0x00,0xff,0x00,0x46]
-#CHECK: locghinl %r11, 255 # encoding: [0xec,0xbb,0x00,0xff,0x00,0x46]
-#CHECK: locghile %r11, 32767 # encoding: [0xec,0xbc,0x7f,0xff,0x00,0x46]
-#CHECK: locghinh %r11, 32767 # encoding: [0xec,0xbd,0x7f,0xff,0x00,0x46]
-#CHECK: locghino %r11, 32512 # encoding: [0xec,0xbe,0x7f,0x00,0x00,0x46]
-#CHECK: locghi %r11, 32512, 15 # encoding: [0xec,0xbf,0x7f,0x00,0x00,0x46]
-
- locghi %r11, 42, 0
- locghio %r11, 42
- locghih %r11, 42
- locghinle %r11, 42
- locghil %r11, -1
- locghinhe %r11, 42
- locghilh %r11, -1
- locghine %r11, 0
- locghie %r11, 0
- locghinlh %r11, 42
- locghihe %r11, 255
- locghinl %r11, 255
- locghile %r11, 32767
- locghinh %r11, 32767
- locghino %r11, 32512
- locghi %r11, 32512, 15
-
-#CHECK: lochhi %r11, 42, 0 # encoding: [0xec,0xb0,0x00,0x2a,0x00,0x4e]
-#CHECK: lochhio %r11, 42 # encoding: [0xec,0xb1,0x00,0x2a,0x00,0x4e]
-#CHECK: lochhih %r11, 42 # encoding: [0xec,0xb2,0x00,0x2a,0x00,0x4e]
-#CHECK: lochhinle %r11, 42 # encoding: [0xec,0xb3,0x00,0x2a,0x00,0x4e]
-#CHECK: lochhil %r11, -1 # encoding: [0xec,0xb4,0xff,0xff,0x00,0x4e]
-#CHECK: lochhinhe %r11, 42 # encoding: [0xec,0xb5,0x00,0x2a,0x00,0x4e]
-#CHECK: lochhilh %r11, -1 # encoding: [0xec,0xb6,0xff,0xff,0x00,0x4e]
-#CHECK: lochhine %r11, 0 # encoding: [0xec,0xb7,0x00,0x00,0x00,0x4e]
-#CHECK: lochhie %r11, 0 # encoding: [0xec,0xb8,0x00,0x00,0x00,0x4e]
-#CHECK: lochhinlh %r11, 42 # encoding: [0xec,0xb9,0x00,0x2a,0x00,0x4e]
-#CHECK: lochhihe %r11, 255 # encoding: [0xec,0xba,0x00,0xff,0x00,0x4e]
-#CHECK: lochhinl %r11, 255 # encoding: [0xec,0xbb,0x00,0xff,0x00,0x4e]
-#CHECK: lochhile %r11, 32767 # encoding: [0xec,0xbc,0x7f,0xff,0x00,0x4e]
-#CHECK: lochhinh %r11, 32767 # encoding: [0xec,0xbd,0x7f,0xff,0x00,0x4e]
-#CHECK: lochhino %r11, 32512 # encoding: [0xec,0xbe,0x7f,0x00,0x00,0x4e]
-#CHECK: lochhi %r11, 32512, 15 # encoding: [0xec,0xbf,0x7f,0x00,0x00,0x4e]
-
- lochhi %r11, 42, 0
- lochhio %r11, 42
- lochhih %r11, 42
- lochhinle %r11, 42
- lochhil %r11, -1
- lochhinhe %r11, 42
- lochhilh %r11, -1
- lochhine %r11, 0
- lochhie %r11, 0
- lochhinlh %r11, 42
- lochhihe %r11, 255
- lochhinl %r11, 255
- lochhile %r11, 32767
- lochhinh %r11, 32767
- lochhino %r11, 32512
- lochhi %r11, 32512, 15
-
-#CHECK: locfh %r0, 0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0xe0]
-#CHECK: locfh %r0, 0, 15 # encoding: [0xeb,0x0f,0x00,0x00,0x00,0xe0]
-#CHECK: locfh %r0, -524288, 0 # encoding: [0xeb,0x00,0x00,0x00,0x80,0xe0]
-#CHECK: locfh %r0, 524287, 0 # encoding: [0xeb,0x00,0x0f,0xff,0x7f,0xe0]
-#CHECK: locfh %r0, 0(%r1), 0 # encoding: [0xeb,0x00,0x10,0x00,0x00,0xe0]
-#CHECK: locfh %r0, 0(%r15), 0 # encoding: [0xeb,0x00,0xf0,0x00,0x00,0xe0]
-#CHECK: locfh %r15, 0, 0 # encoding: [0xeb,0xf0,0x00,0x00,0x00,0xe0]
-#CHECK: locfh %r1, 4095(%r2), 3 # encoding: [0xeb,0x13,0x2f,0xff,0x00,0xe0]
-
- locfh %r0,0,0
- locfh %r0,0,15
- locfh %r0,-524288,0
- locfh %r0,524287,0
- locfh %r0,0(%r1),0
- locfh %r0,0(%r15),0
- locfh %r15,0,0
- locfh %r1,4095(%r2),3
-
-#CHECK: locfho %r1, 2(%r3) # encoding: [0xeb,0x11,0x30,0x02,0x00,0xe0]
-#CHECK: locfhh %r1, 2(%r3) # encoding: [0xeb,0x12,0x30,0x02,0x00,0xe0]
-#CHECK: locfhp %r1, 2(%r3) # encoding: [0xeb,0x12,0x30,0x02,0x00,0xe0]
-#CHECK: locfhnle %r1, 2(%r3) # encoding: [0xeb,0x13,0x30,0x02,0x00,0xe0]
-#CHECK: locfhl %r1, 2(%r3) # encoding: [0xeb,0x14,0x30,0x02,0x00,0xe0]
-#CHECK: locfhm %r1, 2(%r3) # encoding: [0xeb,0x14,0x30,0x02,0x00,0xe0]
-#CHECK: locfhnhe %r1, 2(%r3) # encoding: [0xeb,0x15,0x30,0x02,0x00,0xe0]
-#CHECK: locfhlh %r1, 2(%r3) # encoding: [0xeb,0x16,0x30,0x02,0x00,0xe0]
-#CHECK: locfhne %r1, 2(%r3) # encoding: [0xeb,0x17,0x30,0x02,0x00,0xe0]
-#CHECK: locfhnz %r1, 2(%r3) # encoding: [0xeb,0x17,0x30,0x02,0x00,0xe0]
-#CHECK: locfhe %r1, 2(%r3) # encoding: [0xeb,0x18,0x30,0x02,0x00,0xe0]
-#CHECK: locfhz %r1, 2(%r3) # encoding: [0xeb,0x18,0x30,0x02,0x00,0xe0]
-#CHECK: locfhnlh %r1, 2(%r3) # encoding: [0xeb,0x19,0x30,0x02,0x00,0xe0]
-#CHECK: locfhhe %r1, 2(%r3) # encoding: [0xeb,0x1a,0x30,0x02,0x00,0xe0]
-#CHECK: locfhnl %r1, 2(%r3) # encoding: [0xeb,0x1b,0x30,0x02,0x00,0xe0]
-#CHECK: locfhnm %r1, 2(%r3) # encoding: [0xeb,0x1b,0x30,0x02,0x00,0xe0]
-#CHECK: locfhle %r1, 2(%r3) # encoding: [0xeb,0x1c,0x30,0x02,0x00,0xe0]
-#CHECK: locfhnh %r1, 2(%r3) # encoding: [0xeb,0x1d,0x30,0x02,0x00,0xe0]
-#CHECK: locfhnp %r1, 2(%r3) # encoding: [0xeb,0x1d,0x30,0x02,0x00,0xe0]
-#CHECK: locfhno %r1, 2(%r3) # encoding: [0xeb,0x1e,0x30,0x02,0x00,0xe0]
-
- locfho %r1,2(%r3)
- locfhh %r1,2(%r3)
- locfhp %r1,2(%r3)
- locfhnle %r1,2(%r3)
- locfhl %r1,2(%r3)
- locfhm %r1,2(%r3)
- locfhnhe %r1,2(%r3)
- locfhlh %r1,2(%r3)
- locfhne %r1,2(%r3)
- locfhnz %r1,2(%r3)
- locfhe %r1,2(%r3)
- locfhz %r1,2(%r3)
- locfhnlh %r1,2(%r3)
- locfhhe %r1,2(%r3)
- locfhnl %r1,2(%r3)
- locfhnm %r1,2(%r3)
- locfhle %r1,2(%r3)
- locfhnh %r1,2(%r3)
- locfhnp %r1,2(%r3)
- locfhno %r1,2(%r3)
-
-#CHECK: locfhr %r1, %r2, 0 # encoding: [0xb9,0xe0,0x00,0x12]
-#CHECK: locfhr %r1, %r2, 15 # encoding: [0xb9,0xe0,0xf0,0x12]
-
- locfhr %r1,%r2,0
- locfhr %r1,%r2,15
-
-#CHECK: locfhro %r1, %r3 # encoding: [0xb9,0xe0,0x10,0x13]
-#CHECK: locfhrh %r1, %r3 # encoding: [0xb9,0xe0,0x20,0x13]
-#CHECK: locfhrp %r1, %r3 # encoding: [0xb9,0xe0,0x20,0x13]
-#CHECK: locfhrnle %r1, %r3 # encoding: [0xb9,0xe0,0x30,0x13]
-#CHECK: locfhrl %r1, %r3 # encoding: [0xb9,0xe0,0x40,0x13]
-#CHECK: locfhrm %r1, %r3 # encoding: [0xb9,0xe0,0x40,0x13]
-#CHECK: locfhrnhe %r1, %r3 # encoding: [0xb9,0xe0,0x50,0x13]
-#CHECK: locfhrlh %r1, %r3 # encoding: [0xb9,0xe0,0x60,0x13]
-#CHECK: locfhrne %r1, %r3 # encoding: [0xb9,0xe0,0x70,0x13]
-#CHECK: locfhrnz %r1, %r3 # encoding: [0xb9,0xe0,0x70,0x13]
-#CHECK: locfhre %r1, %r3 # encoding: [0xb9,0xe0,0x80,0x13]
-#CHECK: locfhrz %r1, %r3 # encoding: [0xb9,0xe0,0x80,0x13]
-#CHECK: locfhrnlh %r1, %r3 # encoding: [0xb9,0xe0,0x90,0x13]
-#CHECK: locfhrhe %r1, %r3 # encoding: [0xb9,0xe0,0xa0,0x13]
-#CHECK: locfhrnl %r1, %r3 # encoding: [0xb9,0xe0,0xb0,0x13]
-#CHECK: locfhrnm %r1, %r3 # encoding: [0xb9,0xe0,0xb0,0x13]
-#CHECK: locfhrle %r1, %r3 # encoding: [0xb9,0xe0,0xc0,0x13]
-#CHECK: locfhrnh %r1, %r3 # encoding: [0xb9,0xe0,0xd0,0x13]
-#CHECK: locfhrnp %r1, %r3 # encoding: [0xb9,0xe0,0xd0,0x13]
-#CHECK: locfhrno %r1, %r3 # encoding: [0xb9,0xe0,0xe0,0x13]
-
- locfhro %r1,%r3
- locfhrh %r1,%r3
- locfhrp %r1,%r3
- locfhrnle %r1,%r3
- locfhrl %r1,%r3
- locfhrm %r1,%r3
- locfhrnhe %r1,%r3
- locfhrlh %r1,%r3
- locfhrne %r1,%r3
- locfhrnz %r1,%r3
- locfhre %r1,%r3
- locfhrz %r1,%r3
- locfhrnlh %r1,%r3
- locfhrhe %r1,%r3
- locfhrnl %r1,%r3
- locfhrnm %r1,%r3
- locfhrle %r1,%r3
- locfhrnh %r1,%r3
- locfhrnp %r1,%r3
- locfhrno %r1,%r3
-
-#CHECK: stocfh %r0, 0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0xe1]
-#CHECK: stocfh %r0, 0, 15 # encoding: [0xeb,0x0f,0x00,0x00,0x00,0xe1]
-#CHECK: stocfh %r0, -524288, 0 # encoding: [0xeb,0x00,0x00,0x00,0x80,0xe1]
-#CHECK: stocfh %r0, 524287, 0 # encoding: [0xeb,0x00,0x0f,0xff,0x7f,0xe1]
-#CHECK: stocfh %r0, 0(%r1), 0 # encoding: [0xeb,0x00,0x10,0x00,0x00,0xe1]
-#CHECK: stocfh %r0, 0(%r15), 0 # encoding: [0xeb,0x00,0xf0,0x00,0x00,0xe1]
-#CHECK: stocfh %r15, 0, 0 # encoding: [0xeb,0xf0,0x00,0x00,0x00,0xe1]
-#CHECK: stocfh %r1, 4095(%r2), 3 # encoding: [0xeb,0x13,0x2f,0xff,0x00,0xe1]
-
- stocfh %r0,0,0
- stocfh %r0,0,15
- stocfh %r0,-524288,0
- stocfh %r0,524287,0
- stocfh %r0,0(%r1),0
- stocfh %r0,0(%r15),0
- stocfh %r15,0,0
- stocfh %r1,4095(%r2),3
-
-#CHECK: stocfho %r1, 2(%r3) # encoding: [0xeb,0x11,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhh %r1, 2(%r3) # encoding: [0xeb,0x12,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhp %r1, 2(%r3) # encoding: [0xeb,0x12,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhnle %r1, 2(%r3) # encoding: [0xeb,0x13,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhl %r1, 2(%r3) # encoding: [0xeb,0x14,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhm %r1, 2(%r3) # encoding: [0xeb,0x14,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhnhe %r1, 2(%r3) # encoding: [0xeb,0x15,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhlh %r1, 2(%r3) # encoding: [0xeb,0x16,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhne %r1, 2(%r3) # encoding: [0xeb,0x17,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhnz %r1, 2(%r3) # encoding: [0xeb,0x17,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhe %r1, 2(%r3) # encoding: [0xeb,0x18,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhz %r1, 2(%r3) # encoding: [0xeb,0x18,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhnlh %r1, 2(%r3) # encoding: [0xeb,0x19,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhhe %r1, 2(%r3) # encoding: [0xeb,0x1a,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhnl %r1, 2(%r3) # encoding: [0xeb,0x1b,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhnm %r1, 2(%r3) # encoding: [0xeb,0x1b,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhle %r1, 2(%r3) # encoding: [0xeb,0x1c,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhnh %r1, 2(%r3) # encoding: [0xeb,0x1d,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhnp %r1, 2(%r3) # encoding: [0xeb,0x1d,0x30,0x02,0x00,0xe1]
-#CHECK: stocfhno %r1, 2(%r3) # encoding: [0xeb,0x1e,0x30,0x02,0x00,0xe1]
-
- stocfho %r1,2(%r3)
- stocfhh %r1,2(%r3)
- stocfhp %r1,2(%r3)
- stocfhnle %r1,2(%r3)
- stocfhl %r1,2(%r3)
- stocfhm %r1,2(%r3)
- stocfhnhe %r1,2(%r3)
- stocfhlh %r1,2(%r3)
- stocfhne %r1,2(%r3)
- stocfhnz %r1,2(%r3)
- stocfhe %r1,2(%r3)
- stocfhz %r1,2(%r3)
- stocfhnlh %r1,2(%r3)
- stocfhhe %r1,2(%r3)
- stocfhnl %r1,2(%r3)
- stocfhnm %r1,2(%r3)
- stocfhle %r1,2(%r3)
- stocfhnh %r1,2(%r3)
- stocfhnp %r1,2(%r3)
- stocfhno %r1,2(%r3)
-
diff --git a/test/MC/SystemZ/insn-good-z196.s b/test/MC/SystemZ/insn-good-z196.s
index b24cc7d18e1f..02c473c11a4a 100644
--- a/test/MC/SystemZ/insn-good-z196.s
+++ b/test/MC/SystemZ/insn-good-z196.s
@@ -136,34 +136,6 @@
ark %r15,%r0,%r0
ark %r7,%r8,%r9
-#CHECK: cdfbra %f0, 0, %r0, 0 # encoding: [0xb3,0x95,0x00,0x00]
-#CHECK: cdfbra %f0, 0, %r0, 15 # encoding: [0xb3,0x95,0x0f,0x00]
-#CHECK: cdfbra %f0, 0, %r15, 0 # encoding: [0xb3,0x95,0x00,0x0f]
-#CHECK: cdfbra %f0, 15, %r0, 0 # encoding: [0xb3,0x95,0xf0,0x00]
-#CHECK: cdfbra %f4, 5, %r6, 7 # encoding: [0xb3,0x95,0x57,0x46]
-#CHECK: cdfbra %f15, 0, %r0, 0 # encoding: [0xb3,0x95,0x00,0xf0]
-
- cdfbra %f0, 0, %r0, 0
- cdfbra %f0, 0, %r0, 15
- cdfbra %f0, 0, %r15, 0
- cdfbra %f0, 15, %r0, 0
- cdfbra %f4, 5, %r6, 7
- cdfbra %f15, 0, %r0, 0
-
-#CHECK: cdgbra %f0, 0, %r0, 0 # encoding: [0xb3,0xa5,0x00,0x00]
-#CHECK: cdgbra %f0, 0, %r0, 15 # encoding: [0xb3,0xa5,0x0f,0x00]
-#CHECK: cdgbra %f0, 0, %r15, 0 # encoding: [0xb3,0xa5,0x00,0x0f]
-#CHECK: cdgbra %f0, 15, %r0, 0 # encoding: [0xb3,0xa5,0xf0,0x00]
-#CHECK: cdgbra %f4, 5, %r6, 7 # encoding: [0xb3,0xa5,0x57,0x46]
-#CHECK: cdgbra %f15, 0, %r0, 0 # encoding: [0xb3,0xa5,0x00,0xf0]
-
- cdgbra %f0, 0, %r0, 0
- cdgbra %f0, 0, %r0, 15
- cdgbra %f0, 0, %r15, 0
- cdgbra %f0, 15, %r0, 0
- cdgbra %f4, 5, %r6, 7
- cdgbra %f15, 0, %r0, 0
-
#CHECK: brcth %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xcc,0x06,A,A,A,A]
#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL
brcth %r0, -0x100000000
@@ -201,6 +173,34 @@
brcth %r7,frob@PLT
brcth %r8,frob@PLT
+#CHECK: cdfbra %f0, 0, %r0, 0 # encoding: [0xb3,0x95,0x00,0x00]
+#CHECK: cdfbra %f0, 0, %r0, 15 # encoding: [0xb3,0x95,0x0f,0x00]
+#CHECK: cdfbra %f0, 0, %r15, 0 # encoding: [0xb3,0x95,0x00,0x0f]
+#CHECK: cdfbra %f0, 15, %r0, 0 # encoding: [0xb3,0x95,0xf0,0x00]
+#CHECK: cdfbra %f4, 5, %r6, 7 # encoding: [0xb3,0x95,0x57,0x46]
+#CHECK: cdfbra %f15, 0, %r0, 0 # encoding: [0xb3,0x95,0x00,0xf0]
+
+ cdfbra %f0, 0, %r0, 0
+ cdfbra %f0, 0, %r0, 15
+ cdfbra %f0, 0, %r15, 0
+ cdfbra %f0, 15, %r0, 0
+ cdfbra %f4, 5, %r6, 7
+ cdfbra %f15, 0, %r0, 0
+
+#CHECK: cdgbra %f0, 0, %r0, 0 # encoding: [0xb3,0xa5,0x00,0x00]
+#CHECK: cdgbra %f0, 0, %r0, 15 # encoding: [0xb3,0xa5,0x0f,0x00]
+#CHECK: cdgbra %f0, 0, %r15, 0 # encoding: [0xb3,0xa5,0x00,0x0f]
+#CHECK: cdgbra %f0, 15, %r0, 0 # encoding: [0xb3,0xa5,0xf0,0x00]
+#CHECK: cdgbra %f4, 5, %r6, 7 # encoding: [0xb3,0xa5,0x57,0x46]
+#CHECK: cdgbra %f15, 0, %r0, 0 # encoding: [0xb3,0xa5,0x00,0xf0]
+
+ cdgbra %f0, 0, %r0, 0
+ cdgbra %f0, 0, %r0, 15
+ cdgbra %f0, 0, %r15, 0
+ cdgbra %f0, 15, %r0, 0
+ cdgbra %f4, 5, %r6, 7
+ cdgbra %f15, 0, %r0, 0
+
#CHECK: cdlfbr %f0, 0, %r0, 0 # encoding: [0xb3,0x91,0x00,0x00]
#CHECK: cdlfbr %f0, 0, %r0, 15 # encoding: [0xb3,0x91,0x0f,0x00]
#CHECK: cdlfbr %f0, 0, %r15, 0 # encoding: [0xb3,0x91,0x00,0x0f]
@@ -619,6 +619,36 @@
fixbra %f4, 5, %f8, 9
fixbra %f13, 0, %f0, 0
+#CHECK: kmctr %r2, %r2, %r2 # encoding: [0xb9,0x2d,0x20,0x22]
+#CHECK: kmctr %r2, %r8, %r14 # encoding: [0xb9,0x2d,0x80,0x2e]
+#CHECK: kmctr %r14, %r8, %r2 # encoding: [0xb9,0x2d,0x80,0xe2]
+#CHECK: kmctr %r6, %r8, %r10 # encoding: [0xb9,0x2d,0x80,0x6a]
+
+ kmctr %r2, %r2, %r2
+ kmctr %r2, %r8, %r14
+ kmctr %r14, %r8, %r2
+ kmctr %r6, %r8, %r10
+
+#CHECK: kmf %r2, %r2 # encoding: [0xb9,0x2a,0x00,0x22]
+#CHECK: kmf %r2, %r14 # encoding: [0xb9,0x2a,0x00,0x2e]
+#CHECK: kmf %r14, %r2 # encoding: [0xb9,0x2a,0x00,0xe2]
+#CHECK: kmf %r6, %r10 # encoding: [0xb9,0x2a,0x00,0x6a]
+
+ kmf %r2, %r2
+ kmf %r2, %r14
+ kmf %r14, %r2
+ kmf %r6, %r10
+
+#CHECK: kmo %r2, %r2 # encoding: [0xb9,0x2b,0x00,0x22]
+#CHECK: kmo %r2, %r14 # encoding: [0xb9,0x2b,0x00,0x2e]
+#CHECK: kmo %r14, %r2 # encoding: [0xb9,0x2b,0x00,0xe2]
+#CHECK: kmo %r6, %r10 # encoding: [0xb9,0x2b,0x00,0x6a]
+
+ kmo %r2, %r2
+ kmo %r2, %r14
+ kmo %r14, %r2
+ kmo %r6, %r10
+
#CHECK: laa %r0, %r0, -524288 # encoding: [0xeb,0x00,0x00,0x00,0x80,0xf8]
#CHECK: laa %r0, %r0, -1 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0xf8]
#CHECK: laa %r0, %r0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0xf8]
@@ -1303,6 +1333,10 @@
ork %r15,%r0,%r0
ork %r7,%r8,%r9
+#CHECK: pcc # encoding: [0xb9,0x2c,0x00,0x00]
+
+ pcc
+
#CHECK: popcnt %r0, %r0 # encoding: [0xb9,0xe1,0x00,0x00]
#CHECK: popcnt %r0, %r15 # encoding: [0xb9,0xe1,0x00,0x0f]
#CHECK: popcnt %r15, %r0 # encoding: [0xb9,0xe1,0x00,0xf0]
@@ -1395,18 +1429,6 @@
slgrk %r15,%r0,%r0
slgrk %r7,%r8,%r9
-#CHECK: slrk %r0, %r0, %r0 # encoding: [0xb9,0xfb,0x00,0x00]
-#CHECK: slrk %r0, %r0, %r15 # encoding: [0xb9,0xfb,0xf0,0x00]
-#CHECK: slrk %r0, %r15, %r0 # encoding: [0xb9,0xfb,0x00,0x0f]
-#CHECK: slrk %r15, %r0, %r0 # encoding: [0xb9,0xfb,0x00,0xf0]
-#CHECK: slrk %r7, %r8, %r9 # encoding: [0xb9,0xfb,0x90,0x78]
-
- slrk %r0,%r0,%r0
- slrk %r0,%r0,%r15
- slrk %r0,%r15,%r0
- slrk %r15,%r0,%r0
- slrk %r7,%r8,%r9
-
#CHECK: sllk %r0, %r0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0xdf]
#CHECK: sllk %r15, %r1, 0 # encoding: [0xeb,0xf1,0x00,0x00,0x00,0xdf]
#CHECK: sllk %r1, %r15, 0 # encoding: [0xeb,0x1f,0x00,0x00,0x00,0xdf]
@@ -1433,6 +1455,18 @@
sllk %r0,%r0,524287(%r1)
sllk %r0,%r0,524287(%r15)
+#CHECK: slrk %r0, %r0, %r0 # encoding: [0xb9,0xfb,0x00,0x00]
+#CHECK: slrk %r0, %r0, %r15 # encoding: [0xb9,0xfb,0xf0,0x00]
+#CHECK: slrk %r0, %r15, %r0 # encoding: [0xb9,0xfb,0x00,0x0f]
+#CHECK: slrk %r15, %r0, %r0 # encoding: [0xb9,0xfb,0x00,0xf0]
+#CHECK: slrk %r7, %r8, %r9 # encoding: [0xb9,0xfb,0x90,0x78]
+
+ slrk %r0,%r0,%r0
+ slrk %r0,%r0,%r15
+ slrk %r0,%r15,%r0
+ slrk %r15,%r0,%r0
+ slrk %r7,%r8,%r9
+
#CHECK: srak %r0, %r0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0xdc]
#CHECK: srak %r15, %r1, 0 # encoding: [0xeb,0xf1,0x00,0x00,0x00,0xdc]
#CHECK: srak %r1, %r15, 0 # encoding: [0xeb,0x1f,0x00,0x00,0x00,0xdc]
@@ -1533,28 +1567,6 @@
stch %r0, 524287(%r15,%r1)
stch %r15, 0
-#CHECK: sthh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0xc7]
-#CHECK: sthh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xc7]
-#CHECK: sthh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0xc7]
-#CHECK: sthh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0xc7]
-#CHECK: sthh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xc7]
-#CHECK: sthh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0xc7]
-#CHECK: sthh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xc7]
-#CHECK: sthh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xc7]
-#CHECK: sthh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xc7]
-#CHECK: sthh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xc7]
-
- sthh %r0, -524288
- sthh %r0, -1
- sthh %r0, 0
- sthh %r0, 1
- sthh %r0, 524287
- sthh %r0, 0(%r1)
- sthh %r0, 0(%r15)
- sthh %r0, 524287(%r1,%r15)
- sthh %r0, 524287(%r15,%r1)
- sthh %r15, 0
-
#CHECK: stfh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0xcb]
#CHECK: stfh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xcb]
#CHECK: stfh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0xcb]
@@ -1577,6 +1589,28 @@
stfh %r0, 524287(%r15,%r1)
stfh %r15, 0
+#CHECK: sthh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0xc7]
+#CHECK: sthh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xc7]
+#CHECK: sthh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0xc7]
+#CHECK: sthh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0xc7]
+#CHECK: sthh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xc7]
+#CHECK: sthh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0xc7]
+#CHECK: sthh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xc7]
+#CHECK: sthh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xc7]
+#CHECK: sthh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xc7]
+#CHECK: sthh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xc7]
+
+ sthh %r0, -524288
+ sthh %r0, -1
+ sthh %r0, 0
+ sthh %r0, 1
+ sthh %r0, 524287
+ sthh %r0, 0(%r1)
+ sthh %r0, 0(%r15)
+ sthh %r0, 524287(%r1,%r15)
+ sthh %r0, 524287(%r15,%r1)
+ sthh %r15, 0
+
#CHECK: stoc %r0, 0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0xf3]
#CHECK: stoc %r0, 0, 15 # encoding: [0xeb,0x0f,0x00,0x00,0x00,0xf3]
#CHECK: stoc %r0, -524288, 0 # encoding: [0xeb,0x00,0x00,0x00,0x80,0xf3]
diff --git a/test/MC/SystemZ/insn-good-zEC12.s b/test/MC/SystemZ/insn-good-zEC12.s
index bdaeef95eef1..275d68d8a619 100644
--- a/test/MC/SystemZ/insn-good-zEC12.s
+++ b/test/MC/SystemZ/insn-good-zEC12.s
@@ -178,6 +178,14 @@
clgtnl %r0, 0(%r15)
clgtnh %r0, 0(%r15)
+#CHECK: etnd %r0 # encoding: [0xb2,0xec,0x00,0x00]
+#CHECK: etnd %r15 # encoding: [0xb2,0xec,0x00,0xf0]
+#CHECK: etnd %r7 # encoding: [0xb2,0xec,0x00,0x70]
+
+ etnd %r0
+ etnd %r15
+ etnd %r7
+
#CHECK: lat %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x9f]
#CHECK: lat %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x9f]
#CHECK: lat %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x9f]
@@ -288,14 +296,6 @@
llgtat %r0, 524287(%r15,%r1)
llgtat %r15, 0
-#CHECK: etnd %r0 # encoding: [0xb2,0xec,0x00,0x00]
-#CHECK: etnd %r15 # encoding: [0xb2,0xec,0x00,0xf0]
-#CHECK: etnd %r7 # encoding: [0xb2,0xec,0x00,0x70]
-
- etnd %r0
- etnd %r15
- etnd %r7
-
#CHECK: niai 0, 0 # encoding: [0xb2,0xfa,0x00,0x00]
#CHECK: niai 15, 0 # encoding: [0xb2,0xfa,0x00,0xf0]
#CHECK: niai 0, 15 # encoding: [0xb2,0xfa,0x00,0x0f]
diff --git a/test/MC/SystemZ/insn-good.s b/test/MC/SystemZ/insn-good.s
index f4dddc4712d5..a6228f23c8f8 100644
--- a/test/MC/SystemZ/insn-good.s
+++ b/test/MC/SystemZ/insn-good.s
@@ -415,6 +415,34 @@
algr %r15,%r0
algr %r7,%r8
+#CHECK: algsi -524288, 0 # encoding: [0xeb,0x00,0x00,0x00,0x80,0x7e]
+#CHECK: algsi -1, 0 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x7e]
+#CHECK: algsi 0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0x7e]
+#CHECK: algsi 1, 0 # encoding: [0xeb,0x00,0x00,0x01,0x00,0x7e]
+#CHECK: algsi 524287, 0 # encoding: [0xeb,0x00,0x0f,0xff,0x7f,0x7e]
+#CHECK: algsi 0, -128 # encoding: [0xeb,0x80,0x00,0x00,0x00,0x7e]
+#CHECK: algsi 0, -1 # encoding: [0xeb,0xff,0x00,0x00,0x00,0x7e]
+#CHECK: algsi 0, 1 # encoding: [0xeb,0x01,0x00,0x00,0x00,0x7e]
+#CHECK: algsi 0, 127 # encoding: [0xeb,0x7f,0x00,0x00,0x00,0x7e]
+#CHECK: algsi 0(%r1), 42 # encoding: [0xeb,0x2a,0x10,0x00,0x00,0x7e]
+#CHECK: algsi 0(%r15), 42 # encoding: [0xeb,0x2a,0xf0,0x00,0x00,0x7e]
+#CHECK: algsi 524287(%r1), 42 # encoding: [0xeb,0x2a,0x1f,0xff,0x7f,0x7e]
+#CHECK: algsi 524287(%r15), 42 # encoding: [0xeb,0x2a,0xff,0xff,0x7f,0x7e]
+
+ algsi -524288, 0
+ algsi -1, 0
+ algsi 0, 0
+ algsi 1, 0
+ algsi 524287, 0
+ algsi 0, -128
+ algsi 0, -1
+ algsi 0, 1
+ algsi 0, 127
+ algsi 0(%r1), 42
+ algsi 0(%r15), 42
+ algsi 524287(%r1), 42
+ algsi 524287(%r15), 42
+
#CHECK: alr %r0, %r0 # encoding: [0x1e,0x00]
#CHECK: alr %r0, %r15 # encoding: [0x1e,0x0f]
#CHECK: alr %r15, %r0 # encoding: [0x1e,0xf0]
@@ -425,6 +453,34 @@
alr %r15,%r0
alr %r7,%r8
+#CHECK: alsi -524288, 0 # encoding: [0xeb,0x00,0x00,0x00,0x80,0x6e]
+#CHECK: alsi -1, 0 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x6e]
+#CHECK: alsi 0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0x6e]
+#CHECK: alsi 1, 0 # encoding: [0xeb,0x00,0x00,0x01,0x00,0x6e]
+#CHECK: alsi 524287, 0 # encoding: [0xeb,0x00,0x0f,0xff,0x7f,0x6e]
+#CHECK: alsi 0, -128 # encoding: [0xeb,0x80,0x00,0x00,0x00,0x6e]
+#CHECK: alsi 0, -1 # encoding: [0xeb,0xff,0x00,0x00,0x00,0x6e]
+#CHECK: alsi 0, 1 # encoding: [0xeb,0x01,0x00,0x00,0x00,0x6e]
+#CHECK: alsi 0, 127 # encoding: [0xeb,0x7f,0x00,0x00,0x00,0x6e]
+#CHECK: alsi 0(%r1), 42 # encoding: [0xeb,0x2a,0x10,0x00,0x00,0x6e]
+#CHECK: alsi 0(%r15), 42 # encoding: [0xeb,0x2a,0xf0,0x00,0x00,0x6e]
+#CHECK: alsi 524287(%r1), 42 # encoding: [0xeb,0x2a,0x1f,0xff,0x7f,0x6e]
+#CHECK: alsi 524287(%r15), 42 # encoding: [0xeb,0x2a,0xff,0xff,0x7f,0x6e]
+
+ alsi -524288, 0
+ alsi -1, 0
+ alsi 0, 0
+ alsi 1, 0
+ alsi 524287, 0
+ alsi 0, -128
+ alsi 0, -1
+ alsi 0, 1
+ alsi 0, 127
+ alsi 0(%r1), 42
+ alsi 0(%r15), 42
+ alsi 524287(%r1), 42
+ alsi 524287(%r15), 42
+
#CHECK: aly %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x5e]
#CHECK: aly %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x5e]
#CHECK: aly %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x5e]
@@ -447,6 +503,36 @@
aly %r0, 524287(%r15,%r1)
aly %r15, 0
+#CHECK: ap 0(1), 0(1) # encoding: [0xfa,0x00,0x00,0x00,0x00,0x00]
+#CHECK: ap 0(1), 0(1,%r1) # encoding: [0xfa,0x00,0x00,0x00,0x10,0x00]
+#CHECK: ap 0(1), 0(1,%r15) # encoding: [0xfa,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: ap 0(1), 4095(1) # encoding: [0xfa,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: ap 0(1), 4095(1,%r1) # encoding: [0xfa,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: ap 0(1), 4095(1,%r15) # encoding: [0xfa,0x00,0x00,0x00,0xff,0xff]
+#CHECK: ap 0(1,%r1), 0(1) # encoding: [0xfa,0x00,0x10,0x00,0x00,0x00]
+#CHECK: ap 0(1,%r15), 0(1) # encoding: [0xfa,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: ap 4095(1,%r1), 0(1) # encoding: [0xfa,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: ap 4095(1,%r15), 0(1) # encoding: [0xfa,0x00,0xff,0xff,0x00,0x00]
+#CHECK: ap 0(16,%r1), 0(1) # encoding: [0xfa,0xf0,0x10,0x00,0x00,0x00]
+#CHECK: ap 0(16,%r15), 0(1) # encoding: [0xfa,0xf0,0xf0,0x00,0x00,0x00]
+#CHECK: ap 0(1), 0(16,%r1) # encoding: [0xfa,0x0f,0x00,0x00,0x10,0x00]
+#CHECK: ap 0(1), 0(16,%r15) # encoding: [0xfa,0x0f,0x00,0x00,0xf0,0x00]
+
+ ap 0(1), 0(1)
+ ap 0(1), 0(1,%r1)
+ ap 0(1), 0(1,%r15)
+ ap 0(1), 4095(1)
+ ap 0(1), 4095(1,%r1)
+ ap 0(1), 4095(1,%r15)
+ ap 0(1,%r1), 0(1)
+ ap 0(1,%r15), 0(1)
+ ap 4095(1,%r1), 0(1)
+ ap 4095(1,%r15), 0(1)
+ ap 0(16,%r1), 0(1)
+ ap 0(16,%r15), 0(1)
+ ap 0(1), 0(16,%r1)
+ ap 0(1), 0(16,%r15)
+
#CHECK: ar %r0, %r0 # encoding: [0x1a,0x00]
#CHECK: ar %r0, %r15 # encoding: [0x1a,0x0f]
#CHECK: ar %r15, %r0 # encoding: [0x1a,0xf0]
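
ap (add decimal) in the hunk above is a storage-to-storage instruction; in its 6-byte SS-b encoding the second byte packs both operand lengths, each stored as length minus one. A small illustrative sketch, checked against the ap encodings above (ss_lengths is a hypothetical helper, not part of the test):

def ss_lengths(enc):
    """Return the (l1, l2) operand lengths from an SS-b encoding."""
    return (enc[1] >> 4) + 1, (enc[1] & 0xF) + 1

# "ap 0(16,%r1), 0(1)" is checked as [0xfa,0xf0,0x10,0x00,0x00,0x00]:
assert ss_lengths([0xfa, 0xf0, 0x10, 0x00, 0x00, 0x00]) == (16, 1)
# "ap 0(1), 0(16,%r15)" is checked as [0xfa,0x0f,0x00,0x00,0xf0,0x00]:
assert ss_lengths([0xfa, 0x0f, 0x00, 0x00, 0xf0, 0x00]) == (1, 16)
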
@@ -1965,6 +2051,20 @@
cegbr %f7, %r8
cegbr %f15, %r15
+#CHECK: cfc 0 # encoding: [0xb2,0x1a,0x00,0x00]
+#CHECK: cfc 0(%r1) # encoding: [0xb2,0x1a,0x10,0x00]
+#CHECK: cfc 0(%r15) # encoding: [0xb2,0x1a,0xf0,0x00]
+#CHECK: cfc 4095 # encoding: [0xb2,0x1a,0x0f,0xff]
+#CHECK: cfc 4095(%r1) # encoding: [0xb2,0x1a,0x1f,0xff]
+#CHECK: cfc 4095(%r15) # encoding: [0xb2,0x1a,0xff,0xff]
+
+ cfc 0
+ cfc 0(%r1)
+ cfc 0(%r15)
+ cfc 4095
+ cfc 4095(%r1)
+ cfc 4095(%r15)
+
#CHECK: cfdbr %r0, 0, %f0 # encoding: [0xb3,0x99,0x00,0x00]
#CHECK: cfdbr %r0, 0, %f15 # encoding: [0xb3,0x99,0x00,0x0f]
#CHECK: cfdbr %r0, 15, %f0 # encoding: [0xb3,0x99,0xf0,0x00]
@@ -3435,6 +3535,16 @@
citnl %r15, 1
citnh %r15, 1
+#CHECK: cksm %r0, %r8 # encoding: [0xb2,0x41,0x00,0x08]
+#CHECK: cksm %r0, %r14 # encoding: [0xb2,0x41,0x00,0x0e]
+#CHECK: cksm %r15, %r0 # encoding: [0xb2,0x41,0x00,0xf0]
+#CHECK: cksm %r15, %r8 # encoding: [0xb2,0x41,0x00,0xf8]
+
+ cksm %r0, %r8
+ cksm %r0, %r14
+ cksm %r15, %r0
+ cksm %r15, %r8
+
#CHECK: cl %r0, 0 # encoding: [0x55,0x00,0x00,0x00]
#CHECK: cl %r0, 4095 # encoding: [0x55,0x00,0x0f,0xff]
#CHECK: cl %r0, 0(%r1) # encoding: [0x55,0x00,0x10,0x00]
@@ -3477,6 +3587,54 @@
clc 0(256,%r1), 0
clc 0(256,%r15), 0
+#CHECK: clcl %r0, %r8 # encoding: [0x0f,0x08]
+#CHECK: clcl %r0, %r14 # encoding: [0x0f,0x0e]
+#CHECK: clcl %r14, %r0 # encoding: [0x0f,0xe0]
+#CHECK: clcl %r14, %r8 # encoding: [0x0f,0xe8]
+
+ clcl %r0, %r8
+ clcl %r0, %r14
+ clcl %r14, %r0
+ clcl %r14, %r8
+
+#CHECK: clcle %r0, %r0, 0 # encoding: [0xa9,0x00,0x00,0x00]
+#CHECK: clcle %r0, %r14, 4095 # encoding: [0xa9,0x0e,0x0f,0xff]
+#CHECK: clcle %r0, %r0, 0(%r1) # encoding: [0xa9,0x00,0x10,0x00]
+#CHECK: clcle %r0, %r0, 0(%r15) # encoding: [0xa9,0x00,0xf0,0x00]
+#CHECK: clcle %r14, %r14, 4095(%r1) # encoding: [0xa9,0xee,0x1f,0xff]
+#CHECK: clcle %r0, %r0, 4095(%r15) # encoding: [0xa9,0x00,0xff,0xff]
+#CHECK: clcle %r14, %r0, 0 # encoding: [0xa9,0xe0,0x00,0x00]
+
+ clcle %r0, %r0, 0
+ clcle %r0, %r14, 4095
+ clcle %r0, %r0, 0(%r1)
+ clcle %r0, %r0, 0(%r15)
+ clcle %r14, %r14, 4095(%r1)
+ clcle %r0, %r0, 4095(%r15)
+ clcle %r14, %r0, 0
+
+#CHECK: clclu %r0, %r0, -524288 # encoding: [0xeb,0x00,0x00,0x00,0x80,0x8f]
+#CHECK: clclu %r0, %r0, -1 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x8f]
+#CHECK: clclu %r0, %r14, 0 # encoding: [0xeb,0x0e,0x00,0x00,0x00,0x8f]
+#CHECK: clclu %r0, %r14, 1 # encoding: [0xeb,0x0e,0x00,0x01,0x00,0x8f]
+#CHECK: clclu %r0, %r8, 524287 # encoding: [0xeb,0x08,0x0f,0xff,0x7f,0x8f]
+#CHECK: clclu %r0, %r8, 0(%r1) # encoding: [0xeb,0x08,0x10,0x00,0x00,0x8f]
+#CHECK: clclu %r0, %r4, 0(%r15) # encoding: [0xeb,0x04,0xf0,0x00,0x00,0x8f]
+#CHECK: clclu %r0, %r4, 524287(%r15) # encoding: [0xeb,0x04,0xff,0xff,0x7f,0x8f]
+#CHECK: clclu %r0, %r0, 524287(%r1) # encoding: [0xeb,0x00,0x1f,0xff,0x7f,0x8f]
+#CHECK: clclu %r14, %r0, 0 # encoding: [0xeb,0xe0,0x00,0x00,0x00,0x8f]
+
+ clclu %r0, %r0, -524288
+ clclu %r0, %r0, -1
+ clclu %r0, %r14, 0
+ clclu %r0, %r14, 1
+ clclu %r0, %r8, 524287
+ clclu %r0, %r8, 0(%r1)
+ clclu %r0, %r4, 0(%r15)
+ clclu %r0, %r4, 524287(%r15)
+ clclu %r0, %r0, 524287(%r1)
+ clclu %r14, %r0, 0
+
#CHECK: clfhsi 0, 0 # encoding: [0xe5,0x5d,0x00,0x00,0x00,0x00]
#CHECK: clfhsi 4095, 0 # encoding: [0xe5,0x5d,0x0f,0xff,0x00,0x00]
#CHECK: clfhsi 0, 65535 # encoding: [0xe5,0x5d,0x00,0x00,0xff,0xff]
@@ -4751,6 +4909,66 @@
cliy 524287(%r1), 42
cliy 524287(%r15), 42
+#CHECK: clm %r0, 0, 0 # encoding: [0xbd,0x00,0x00,0x00]
+#CHECK: clm %r0, 15, 4095 # encoding: [0xbd,0x0f,0x0f,0xff]
+#CHECK: clm %r0, 0, 0(%r1) # encoding: [0xbd,0x00,0x10,0x00]
+#CHECK: clm %r0, 0, 0(%r15) # encoding: [0xbd,0x00,0xf0,0x00]
+#CHECK: clm %r15, 15, 4095(%r1) # encoding: [0xbd,0xff,0x1f,0xff]
+#CHECK: clm %r0, 0, 4095(%r15) # encoding: [0xbd,0x00,0xff,0xff]
+#CHECK: clm %r15, 0, 0 # encoding: [0xbd,0xf0,0x00,0x00]
+
+ clm %r0, 0, 0
+ clm %r0, 15, 4095
+ clm %r0, 0, 0(%r1)
+ clm %r0, 0, 0(%r15)
+ clm %r15, 15, 4095(%r1)
+ clm %r0, 0, 4095(%r15)
+ clm %r15, 0, 0
+
+#CHECK: clmh %r0, 0, -524288 # encoding: [0xeb,0x00,0x00,0x00,0x80,0x20]
+#CHECK: clmh %r0, 0, -1 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x20]
+#CHECK: clmh %r0, 15, 0 # encoding: [0xeb,0x0f,0x00,0x00,0x00,0x20]
+#CHECK: clmh %r0, 15, 1 # encoding: [0xeb,0x0f,0x00,0x01,0x00,0x20]
+#CHECK: clmh %r0, 8, 524287 # encoding: [0xeb,0x08,0x0f,0xff,0x7f,0x20]
+#CHECK: clmh %r0, 8, 0(%r1) # encoding: [0xeb,0x08,0x10,0x00,0x00,0x20]
+#CHECK: clmh %r0, 4, 0(%r15) # encoding: [0xeb,0x04,0xf0,0x00,0x00,0x20]
+#CHECK: clmh %r0, 4, 524287(%r15) # encoding: [0xeb,0x04,0xff,0xff,0x7f,0x20]
+#CHECK: clmh %r0, 0, 524287(%r1) # encoding: [0xeb,0x00,0x1f,0xff,0x7f,0x20]
+#CHECK: clmh %r15, 0, 0 # encoding: [0xeb,0xf0,0x00,0x00,0x00,0x20]
+
+ clmh %r0, 0, -524288
+ clmh %r0, 0, -1
+ clmh %r0, 15, 0
+ clmh %r0, 15, 1
+ clmh %r0, 8, 524287
+ clmh %r0, 8, 0(%r1)
+ clmh %r0, 4, 0(%r15)
+ clmh %r0, 4, 524287(%r15)
+ clmh %r0, 0, 524287(%r1)
+ clmh %r15, 0, 0
+
+#CHECK: clmy %r0, 0, -524288 # encoding: [0xeb,0x00,0x00,0x00,0x80,0x21]
+#CHECK: clmy %r0, 0, -1 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x21]
+#CHECK: clmy %r0, 15, 0 # encoding: [0xeb,0x0f,0x00,0x00,0x00,0x21]
+#CHECK: clmy %r0, 15, 1 # encoding: [0xeb,0x0f,0x00,0x01,0x00,0x21]
+#CHECK: clmy %r0, 8, 524287 # encoding: [0xeb,0x08,0x0f,0xff,0x7f,0x21]
+#CHECK: clmy %r0, 8, 0(%r1) # encoding: [0xeb,0x08,0x10,0x00,0x00,0x21]
+#CHECK: clmy %r0, 4, 0(%r15) # encoding: [0xeb,0x04,0xf0,0x00,0x00,0x21]
+#CHECK: clmy %r0, 4, 524287(%r15) # encoding: [0xeb,0x04,0xff,0xff,0x7f,0x21]
+#CHECK: clmy %r0, 0, 524287(%r1) # encoding: [0xeb,0x00,0x1f,0xff,0x7f,0x21]
+#CHECK: clmy %r15, 0, 0 # encoding: [0xeb,0xf0,0x00,0x00,0x00,0x21]
+
+ clmy %r0, 0, -524288
+ clmy %r0, 0, -1
+ clmy %r0, 15, 0
+ clmy %r0, 15, 1
+ clmy %r0, 8, 524287
+ clmy %r0, 8, 0(%r1)
+ clmy %r0, 4, 0(%r15)
+ clmy %r0, 4, 524287(%r15)
+ clmy %r0, 0, 524287(%r1)
+ clmy %r15, 0, 0
+
#CHECK: clr %r0, %r0 # encoding: [0x15,0x00]
#CHECK: clr %r0, %r15 # encoding: [0x15,0x0f]
#CHECK: clr %r15, %r0 # encoding: [0x15,0xf0]
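
clm (compare logical under mask) above compares the register bytes selected by a 4-bit mask; its 4-byte RS-b encoding packs R1 and M3 into the second byte, followed by a base nibble and a 12-bit unsigned displacement. An illustrative sketch (decode_rs_b is a made-up helper), checked against the clm lines above:

def decode_rs_b(enc):
    """Return (r1, m3, b2, d2) from a 4-byte RS-b encoding."""
    r1, m3 = enc[1] >> 4, enc[1] & 0xF
    b2, d2 = enc[2] >> 4, ((enc[2] & 0xF) << 8) | enc[3]
    return r1, m3, b2, d2

# "clm %r15, 15, 4095(%r1)" is checked as [0xbd,0xff,0x1f,0xff]:
assert decode_rs_b([0xbd, 0xff, 0x1f, 0xff]) == (15, 15, 1, 4095)
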
@@ -5180,6 +5398,46 @@
cly %r0, 524287(%r15,%r1)
cly %r15, 0
+#CHECK: cmpsc %r0, %r8 # encoding: [0xb2,0x63,0x00,0x08]
+#CHECK: cmpsc %r0, %r14 # encoding: [0xb2,0x63,0x00,0x0e]
+#CHECK: cmpsc %r14, %r0 # encoding: [0xb2,0x63,0x00,0xe0]
+#CHECK: cmpsc %r14, %r8 # encoding: [0xb2,0x63,0x00,0xe8]
+
+ cmpsc %r0, %r8
+ cmpsc %r0, %r14
+ cmpsc %r14, %r0
+ cmpsc %r14, %r8
+
+#CHECK: cp 0(1), 0(1) # encoding: [0xf9,0x00,0x00,0x00,0x00,0x00]
+#CHECK: cp 0(1), 0(1,%r1) # encoding: [0xf9,0x00,0x00,0x00,0x10,0x00]
+#CHECK: cp 0(1), 0(1,%r15) # encoding: [0xf9,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: cp 0(1), 4095(1) # encoding: [0xf9,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: cp 0(1), 4095(1,%r1) # encoding: [0xf9,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: cp 0(1), 4095(1,%r15) # encoding: [0xf9,0x00,0x00,0x00,0xff,0xff]
+#CHECK: cp 0(1,%r1), 0(1) # encoding: [0xf9,0x00,0x10,0x00,0x00,0x00]
+#CHECK: cp 0(1,%r15), 0(1) # encoding: [0xf9,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: cp 4095(1,%r1), 0(1) # encoding: [0xf9,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: cp 4095(1,%r15), 0(1) # encoding: [0xf9,0x00,0xff,0xff,0x00,0x00]
+#CHECK: cp 0(16,%r1), 0(1) # encoding: [0xf9,0xf0,0x10,0x00,0x00,0x00]
+#CHECK: cp 0(16,%r15), 0(1) # encoding: [0xf9,0xf0,0xf0,0x00,0x00,0x00]
+#CHECK: cp 0(1), 0(16,%r1) # encoding: [0xf9,0x0f,0x00,0x00,0x10,0x00]
+#CHECK: cp 0(1), 0(16,%r15) # encoding: [0xf9,0x0f,0x00,0x00,0xf0,0x00]
+
+ cp 0(1), 0(1)
+ cp 0(1), 0(1,%r1)
+ cp 0(1), 0(1,%r15)
+ cp 0(1), 4095(1)
+ cp 0(1), 4095(1,%r1)
+ cp 0(1), 4095(1,%r15)
+ cp 0(1,%r1), 0(1)
+ cp 0(1,%r15), 0(1)
+ cp 4095(1,%r1), 0(1)
+ cp 4095(1,%r15), 0(1)
+ cp 0(16,%r1), 0(1)
+ cp 0(16,%r15), 0(1)
+ cp 0(1), 0(16,%r1)
+ cp 0(1), 0(16,%r15)
+
#CHECK: cpsdr %f0, %f0, %f0 # encoding: [0xb3,0x72,0x00,0x00]
#CHECK: cpsdr %f0, %f0, %f15 # encoding: [0xb3,0x72,0x00,0x0f]
#CHECK: cpsdr %f0, %f15, %f0 # encoding: [0xb3,0x72,0xf0,0x00]
@@ -5623,6 +5881,20 @@
csg %r0, %r15, 0
csg %r15, %r0, 0
+#CHECK: csst 0, 0, %r0 # encoding: [0xc8,0x02,0x00,0x00,0x00,0x00]
+#CHECK: csst 0(%r1), 0(%r15), %r2 # encoding: [0xc8,0x22,0x10,0x00,0xf0,0x00]
+#CHECK: csst 1(%r1), 0(%r15), %r2 # encoding: [0xc8,0x22,0x10,0x01,0xf0,0x00]
+#CHECK: csst 4095(%r1), 0(%r15), %r2 # encoding: [0xc8,0x22,0x1f,0xff,0xf0,0x00]
+#CHECK: csst 0(%r1), 1(%r15), %r2 # encoding: [0xc8,0x22,0x10,0x00,0xf0,0x01]
+#CHECK: csst 0(%r1), 4095(%r15), %r2 # encoding: [0xc8,0x22,0x10,0x00,0xff,0xff]
+
+ csst 0, 0, %r0
+ csst 0(%r1), 0(%r15), %r2
+ csst 1(%r1), 0(%r15), %r2
+ csst 4095(%r1), 0(%r15), %r2
+ csst 0(%r1), 1(%r15), %r2
+ csst 0(%r1), 4095(%r15), %r2
+
#CHECK: csy %r0, %r0, -524288 # encoding: [0xeb,0x00,0x00,0x00,0x80,0x14]
#CHECK: csy %r0, %r0, -1 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x14]
#CHECK: csy %r0, %r0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0x14]
@@ -5647,19 +5919,239 @@
csy %r0, %r15, 0
csy %r15, %r0, 0
-#CHECK: csst 0, 0, %r0 # encoding: [0xc8,0x02,0x00,0x00,0x00,0x00]
-#CHECK: csst 0(%r1), 0(%r15), %r2 # encoding: [0xc8,0x22,0x10,0x00,0xf0,0x00]
-#CHECK: csst 1(%r1), 0(%r15), %r2 # encoding: [0xc8,0x22,0x10,0x01,0xf0,0x00]
-#CHECK: csst 4095(%r1), 0(%r15), %r2 # encoding: [0xc8,0x22,0x1f,0xff,0xf0,0x00]
-#CHECK: csst 0(%r1), 1(%r15), %r2 # encoding: [0xc8,0x22,0x10,0x00,0xf0,0x01]
-#CHECK: csst 0(%r1), 4095(%r15), %r2 # encoding: [0xc8,0x22,0x10,0x00,0xff,0xff]
-
- csst 0, 0, %r0
- csst 0(%r1), 0(%r15), %r2
- csst 1(%r1), 0(%r15), %r2
- csst 4095(%r1), 0(%r15), %r2
- csst 0(%r1), 1(%r15), %r2
- csst 0(%r1), 4095(%r15), %r2
+#CHECK: cu12 %r0, %r0 # encoding: [0xb2,0xa7,0x00,0x00]
+#CHECK: cu12 %r0, %r14 # encoding: [0xb2,0xa7,0x00,0x0e]
+#CHECK: cu12 %r14, %r0 # encoding: [0xb2,0xa7,0x00,0xe0]
+#CHECK: cu12 %r6, %r8 # encoding: [0xb2,0xa7,0x00,0x68]
+#CHECK: cu12 %r4, %r12, 0 # encoding: [0xb2,0xa7,0x00,0x4c]
+#CHECK: cu12 %r4, %r12, 15 # encoding: [0xb2,0xa7,0xf0,0x4c]
+
+ cu12 %r0, %r0
+ cu12 %r0, %r14
+ cu12 %r14, %r0
+ cu12 %r6, %r8
+ cu12 %r4, %r12, 0
+ cu12 %r4, %r12, 15
+
+#CHECK: cu14 %r0, %r0 # encoding: [0xb9,0xb0,0x00,0x00]
+#CHECK: cu14 %r0, %r14 # encoding: [0xb9,0xb0,0x00,0x0e]
+#CHECK: cu14 %r14, %r0 # encoding: [0xb9,0xb0,0x00,0xe0]
+#CHECK: cu14 %r6, %r8 # encoding: [0xb9,0xb0,0x00,0x68]
+#CHECK: cu14 %r4, %r12, 0 # encoding: [0xb9,0xb0,0x00,0x4c]
+#CHECK: cu14 %r4, %r12, 15 # encoding: [0xb9,0xb0,0xf0,0x4c]
+
+ cu14 %r0, %r0
+ cu14 %r0, %r14
+ cu14 %r14, %r0
+ cu14 %r6, %r8
+ cu14 %r4, %r12, 0
+ cu14 %r4, %r12, 15
+
+#CHECK: cu21 %r0, %r0 # encoding: [0xb2,0xa6,0x00,0x00]
+#CHECK: cu21 %r0, %r14 # encoding: [0xb2,0xa6,0x00,0x0e]
+#CHECK: cu21 %r14, %r0 # encoding: [0xb2,0xa6,0x00,0xe0]
+#CHECK: cu21 %r6, %r8 # encoding: [0xb2,0xa6,0x00,0x68]
+#CHECK: cu21 %r4, %r12, 0 # encoding: [0xb2,0xa6,0x00,0x4c]
+#CHECK: cu21 %r4, %r12, 15 # encoding: [0xb2,0xa6,0xf0,0x4c]
+
+ cu21 %r0, %r0
+ cu21 %r0, %r14
+ cu21 %r14, %r0
+ cu21 %r6, %r8
+ cu21 %r4, %r12, 0
+ cu21 %r4, %r12, 15
+
+#CHECK: cu24 %r0, %r0 # encoding: [0xb9,0xb1,0x00,0x00]
+#CHECK: cu24 %r0, %r14 # encoding: [0xb9,0xb1,0x00,0x0e]
+#CHECK: cu24 %r14, %r0 # encoding: [0xb9,0xb1,0x00,0xe0]
+#CHECK: cu24 %r6, %r8 # encoding: [0xb9,0xb1,0x00,0x68]
+#CHECK: cu24 %r4, %r12, 0 # encoding: [0xb9,0xb1,0x00,0x4c]
+#CHECK: cu24 %r4, %r12, 15 # encoding: [0xb9,0xb1,0xf0,0x4c]
+
+ cu24 %r0, %r0
+ cu24 %r0, %r14
+ cu24 %r14, %r0
+ cu24 %r6, %r8
+ cu24 %r4, %r12, 0
+ cu24 %r4, %r12, 15
+
+#CHECK: cu41 %r0, %r0 # encoding: [0xb9,0xb2,0x00,0x00]
+#CHECK: cu41 %r0, %r14 # encoding: [0xb9,0xb2,0x00,0x0e]
+#CHECK: cu41 %r14, %r0 # encoding: [0xb9,0xb2,0x00,0xe0]
+#CHECK: cu41 %r6, %r8 # encoding: [0xb9,0xb2,0x00,0x68]
+
+ cu41 %r0, %r0
+ cu41 %r0, %r14
+ cu41 %r14, %r0
+ cu41 %r6, %r8
+
+#CHECK: cu42 %r0, %r0 # encoding: [0xb9,0xb3,0x00,0x00]
+#CHECK: cu42 %r0, %r14 # encoding: [0xb9,0xb3,0x00,0x0e]
+#CHECK: cu42 %r14, %r0 # encoding: [0xb9,0xb3,0x00,0xe0]
+#CHECK: cu42 %r6, %r8 # encoding: [0xb9,0xb3,0x00,0x68]
+
+ cu42 %r0, %r0
+ cu42 %r0, %r14
+ cu42 %r14, %r0
+ cu42 %r6, %r8
+
+#CHECK: cuse %r0, %r8 # encoding: [0xb2,0x57,0x00,0x08]
+#CHECK: cuse %r0, %r14 # encoding: [0xb2,0x57,0x00,0x0e]
+#CHECK: cuse %r14, %r0 # encoding: [0xb2,0x57,0x00,0xe0]
+#CHECK: cuse %r14, %r8 # encoding: [0xb2,0x57,0x00,0xe8]
+
+ cuse %r0, %r8
+ cuse %r0, %r14
+ cuse %r14, %r0
+ cuse %r14, %r8
+
+#CHECK: cutfu %r0, %r0 # encoding: [0xb2,0xa7,0x00,0x00]
+#CHECK: cutfu %r0, %r14 # encoding: [0xb2,0xa7,0x00,0x0e]
+#CHECK: cutfu %r14, %r0 # encoding: [0xb2,0xa7,0x00,0xe0]
+#CHECK: cutfu %r6, %r8 # encoding: [0xb2,0xa7,0x00,0x68]
+#CHECK: cutfu %r4, %r12, 0 # encoding: [0xb2,0xa7,0x00,0x4c]
+#CHECK: cutfu %r4, %r12, 15 # encoding: [0xb2,0xa7,0xf0,0x4c]
+
+ cutfu %r0, %r0
+ cutfu %r0, %r14
+ cutfu %r14, %r0
+ cutfu %r6, %r8
+ cutfu %r4, %r12, 0
+ cutfu %r4, %r12, 15
+
+#CHECK: cuutf %r0, %r0 # encoding: [0xb2,0xa6,0x00,0x00]
+#CHECK: cuutf %r0, %r14 # encoding: [0xb2,0xa6,0x00,0x0e]
+#CHECK: cuutf %r14, %r0 # encoding: [0xb2,0xa6,0x00,0xe0]
+#CHECK: cuutf %r6, %r8 # encoding: [0xb2,0xa6,0x00,0x68]
+#CHECK: cuutf %r4, %r12, 0 # encoding: [0xb2,0xa6,0x00,0x4c]
+#CHECK: cuutf %r4, %r12, 15 # encoding: [0xb2,0xa6,0xf0,0x4c]
+
+ cuutf %r0, %r0
+ cuutf %r0, %r14
+ cuutf %r14, %r0
+ cuutf %r6, %r8
+ cuutf %r4, %r12, 0
+ cuutf %r4, %r12, 15
+
+#CHECK: cvb %r0, 0 # encoding: [0x4f,0x00,0x00,0x00]
+#CHECK: cvb %r0, 4095 # encoding: [0x4f,0x00,0x0f,0xff]
+#CHECK: cvb %r0, 0(%r1) # encoding: [0x4f,0x00,0x10,0x00]
+#CHECK: cvb %r0, 0(%r15) # encoding: [0x4f,0x00,0xf0,0x00]
+#CHECK: cvb %r0, 4095(%r1,%r15) # encoding: [0x4f,0x01,0xff,0xff]
+#CHECK: cvb %r0, 4095(%r15,%r1) # encoding: [0x4f,0x0f,0x1f,0xff]
+#CHECK: cvb %r15, 0 # encoding: [0x4f,0xf0,0x00,0x00]
+
+ cvb %r0, 0
+ cvb %r0, 4095
+ cvb %r0, 0(%r1)
+ cvb %r0, 0(%r15)
+ cvb %r0, 4095(%r1,%r15)
+ cvb %r0, 4095(%r15,%r1)
+ cvb %r15, 0
+
+#CHECK: cvbg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x0e]
+#CHECK: cvbg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x0e]
+#CHECK: cvbg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x0e]
+#CHECK: cvbg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x0e]
+#CHECK: cvbg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x0e]
+#CHECK: cvbg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x0e]
+#CHECK: cvbg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x0e]
+#CHECK: cvbg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x0e]
+#CHECK: cvbg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x0e]
+#CHECK: cvbg %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x0e]
+
+ cvbg %r0, -524288
+ cvbg %r0, -1
+ cvbg %r0, 0
+ cvbg %r0, 1
+ cvbg %r0, 524287
+ cvbg %r0, 0(%r1)
+ cvbg %r0, 0(%r15)
+ cvbg %r0, 524287(%r1,%r15)
+ cvbg %r0, 524287(%r15,%r1)
+ cvbg %r15, 0
+
+#CHECK: cvby %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x06]
+#CHECK: cvby %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x06]
+#CHECK: cvby %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x06]
+#CHECK: cvby %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x06]
+#CHECK: cvby %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x06]
+#CHECK: cvby %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x06]
+#CHECK: cvby %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x06]
+#CHECK: cvby %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x06]
+#CHECK: cvby %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x06]
+#CHECK: cvby %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x06]
+
+ cvby %r0, -524288
+ cvby %r0, -1
+ cvby %r0, 0
+ cvby %r0, 1
+ cvby %r0, 524287
+ cvby %r0, 0(%r1)
+ cvby %r0, 0(%r15)
+ cvby %r0, 524287(%r1,%r15)
+ cvby %r0, 524287(%r15,%r1)
+ cvby %r15, 0
+
+#CHECK: cvd %r0, 0 # encoding: [0x4e,0x00,0x00,0x00]
+#CHECK: cvd %r0, 4095 # encoding: [0x4e,0x00,0x0f,0xff]
+#CHECK: cvd %r0, 0(%r1) # encoding: [0x4e,0x00,0x10,0x00]
+#CHECK: cvd %r0, 0(%r15) # encoding: [0x4e,0x00,0xf0,0x00]
+#CHECK: cvd %r0, 4095(%r1,%r15) # encoding: [0x4e,0x01,0xff,0xff]
+#CHECK: cvd %r0, 4095(%r15,%r1) # encoding: [0x4e,0x0f,0x1f,0xff]
+#CHECK: cvd %r15, 0 # encoding: [0x4e,0xf0,0x00,0x00]
+
+ cvd %r0, 0
+ cvd %r0, 4095
+ cvd %r0, 0(%r1)
+ cvd %r0, 0(%r15)
+ cvd %r0, 4095(%r1,%r15)
+ cvd %r0, 4095(%r15,%r1)
+ cvd %r15, 0
+
+#CHECK: cvdg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x2e]
+#CHECK: cvdg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x2e]
+#CHECK: cvdg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x2e]
+#CHECK: cvdg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x2e]
+#CHECK: cvdg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x2e]
+#CHECK: cvdg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x2e]
+#CHECK: cvdg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x2e]
+#CHECK: cvdg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x2e]
+#CHECK: cvdg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x2e]
+#CHECK: cvdg %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x2e]
+
+ cvdg %r0, -524288
+ cvdg %r0, -1
+ cvdg %r0, 0
+ cvdg %r0, 1
+ cvdg %r0, 524287
+ cvdg %r0, 0(%r1)
+ cvdg %r0, 0(%r15)
+ cvdg %r0, 524287(%r1,%r15)
+ cvdg %r0, 524287(%r15,%r1)
+ cvdg %r15, 0
+
+#CHECK: cvdy %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x26]
+#CHECK: cvdy %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x26]
+#CHECK: cvdy %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x26]
+#CHECK: cvdy %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x26]
+#CHECK: cvdy %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x26]
+#CHECK: cvdy %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x26]
+#CHECK: cvdy %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x26]
+#CHECK: cvdy %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x26]
+#CHECK: cvdy %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x26]
+#CHECK: cvdy %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x26]
+
+ cvdy %r0, -524288
+ cvdy %r0, -1
+ cvdy %r0, 0
+ cvdy %r0, 1
+ cvdy %r0, 524287
+ cvdy %r0, 0(%r1)
+ cvdy %r0, 0(%r15)
+ cvdy %r0, 524287(%r1,%r15)
+ cvdy %r0, 524287(%r15,%r1)
+ cvdy %r15, 0
#CHECK: cxbr %f0, %f0 # encoding: [0xb3,0x49,0x00,0x00]
#CHECK: cxbr %f0, %f13 # encoding: [0xb3,0x49,0x00,0x0d]
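
The cu12/cu14/cu21/cu24 Unicode conversions added above accept an optional mask as a third operand; in the 4-byte RRF-c encoding it sits in the high nibble of the third byte, with R1 and R2 in the fourth. The encodings also show cutfu and cuutf producing the same bytes as cu12 and cu21 — they are the older mnemonics for the same opcodes (0xb2a7 and 0xb2a6). An illustrative sketch (decode_rrf_c is a hypothetical helper), checked against the cu12 lines above:

def decode_rrf_c(enc):
    """Return (m3, r1, r2) from a 4-byte RRF-c encoding."""
    return enc[2] >> 4, enc[3] >> 4, enc[3] & 0xF

# "cu12 %r4, %r12, 15" is checked as [0xb2,0xa7,0xf0,0x4c]:
assert decode_rrf_c([0xb2, 0xa7, 0xf0, 0x4c]) == (15, 4, 12)
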
@@ -5717,6 +6209,22 @@
cy %r0, 524287(%r15,%r1)
cy %r15, 0
+#CHECK: d %r0, 0 # encoding: [0x5d,0x00,0x00,0x00]
+#CHECK: d %r0, 4095 # encoding: [0x5d,0x00,0x0f,0xff]
+#CHECK: d %r0, 0(%r1) # encoding: [0x5d,0x00,0x10,0x00]
+#CHECK: d %r0, 0(%r15) # encoding: [0x5d,0x00,0xf0,0x00]
+#CHECK: d %r0, 4095(%r1,%r15) # encoding: [0x5d,0x01,0xff,0xff]
+#CHECK: d %r0, 4095(%r15,%r1) # encoding: [0x5d,0x0f,0x1f,0xff]
+#CHECK: d %r14, 0 # encoding: [0x5d,0xe0,0x00,0x00]
+
+ d %r0, 0
+ d %r0, 4095
+ d %r0, 0(%r1)
+ d %r0, 0(%r15)
+ d %r0, 4095(%r1,%r15)
+ d %r0, 4095(%r15,%r1)
+ d %r14, 0
+
#CHECK: ddb %f0, 0 # encoding: [0xed,0x00,0x00,0x00,0x00,0x1d]
#CHECK: ddb %f0, 4095 # encoding: [0xed,0x00,0x0f,0xff,0x00,0x1d]
#CHECK: ddb %f0, 0(%r1) # encoding: [0xed,0x00,0x10,0x00,0x00,0x1d]
@@ -5769,6 +6277,34 @@
debr %f7, %f8
debr %f15, %f0
+#CHECK: didbr %f0, %f0, %f0, 0 # encoding: [0xb3,0x5b,0x00,0x00]
+#CHECK: didbr %f0, %f0, %f0, 15 # encoding: [0xb3,0x5b,0x0f,0x00]
+#CHECK: didbr %f0, %f0, %f15, 0 # encoding: [0xb3,0x5b,0x00,0x0f]
+#CHECK: didbr %f0, %f15, %f0, 0 # encoding: [0xb3,0x5b,0xf0,0x00]
+#CHECK: didbr %f4, %f5, %f6, 7 # encoding: [0xb3,0x5b,0x57,0x46]
+#CHECK: didbr %f15, %f0, %f0, 0 # encoding: [0xb3,0x5b,0x00,0xf0]
+
+ didbr %f0, %f0, %f0, 0
+ didbr %f0, %f0, %f0, 15
+ didbr %f0, %f0, %f15, 0
+ didbr %f0, %f15, %f0, 0
+ didbr %f4, %f5, %f6, 7
+ didbr %f15, %f0, %f0, 0
+
+#CHECK: diebr %f0, %f0, %f0, 0 # encoding: [0xb3,0x53,0x00,0x00]
+#CHECK: diebr %f0, %f0, %f0, 15 # encoding: [0xb3,0x53,0x0f,0x00]
+#CHECK: diebr %f0, %f0, %f15, 0 # encoding: [0xb3,0x53,0x00,0x0f]
+#CHECK: diebr %f0, %f15, %f0, 0 # encoding: [0xb3,0x53,0xf0,0x00]
+#CHECK: diebr %f4, %f5, %f6, 7 # encoding: [0xb3,0x53,0x57,0x46]
+#CHECK: diebr %f15, %f0, %f0, 0 # encoding: [0xb3,0x53,0x00,0xf0]
+
+ diebr %f0, %f0, %f0, 0
+ diebr %f0, %f0, %f0, 15
+ diebr %f0, %f0, %f15, 0
+ diebr %f0, %f15, %f0, 0
+ diebr %f4, %f5, %f6, 7
+ diebr %f15, %f0, %f0, 0
+
#CHECK: dl %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x97]
#CHECK: dl %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x97]
#CHECK: dl %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x97]
@@ -5833,6 +6369,46 @@
dlr %r14,%r0
dlr %r6,%r9
+#CHECK: dp 0(1), 0(1) # encoding: [0xfd,0x00,0x00,0x00,0x00,0x00]
+#CHECK: dp 0(1), 0(1,%r1) # encoding: [0xfd,0x00,0x00,0x00,0x10,0x00]
+#CHECK: dp 0(1), 0(1,%r15) # encoding: [0xfd,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: dp 0(1), 4095(1) # encoding: [0xfd,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: dp 0(1), 4095(1,%r1) # encoding: [0xfd,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: dp 0(1), 4095(1,%r15) # encoding: [0xfd,0x00,0x00,0x00,0xff,0xff]
+#CHECK: dp 0(1,%r1), 0(1) # encoding: [0xfd,0x00,0x10,0x00,0x00,0x00]
+#CHECK: dp 0(1,%r15), 0(1) # encoding: [0xfd,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: dp 4095(1,%r1), 0(1) # encoding: [0xfd,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: dp 4095(1,%r15), 0(1) # encoding: [0xfd,0x00,0xff,0xff,0x00,0x00]
+#CHECK: dp 0(16,%r1), 0(1) # encoding: [0xfd,0xf0,0x10,0x00,0x00,0x00]
+#CHECK: dp 0(16,%r15), 0(1) # encoding: [0xfd,0xf0,0xf0,0x00,0x00,0x00]
+#CHECK: dp 0(1), 0(16,%r1) # encoding: [0xfd,0x0f,0x00,0x00,0x10,0x00]
+#CHECK: dp 0(1), 0(16,%r15) # encoding: [0xfd,0x0f,0x00,0x00,0xf0,0x00]
+
+ dp 0(1), 0(1)
+ dp 0(1), 0(1,%r1)
+ dp 0(1), 0(1,%r15)
+ dp 0(1), 4095(1)
+ dp 0(1), 4095(1,%r1)
+ dp 0(1), 4095(1,%r15)
+ dp 0(1,%r1), 0(1)
+ dp 0(1,%r15), 0(1)
+ dp 4095(1,%r1), 0(1)
+ dp 4095(1,%r15), 0(1)
+ dp 0(16,%r1), 0(1)
+ dp 0(16,%r15), 0(1)
+ dp 0(1), 0(16,%r1)
+ dp 0(1), 0(16,%r15)
+
+#CHECK: dr %r0, %r0 # encoding: [0x1d,0x00]
+#CHECK: dr %r0, %r15 # encoding: [0x1d,0x0f]
+#CHECK: dr %r14, %r0 # encoding: [0x1d,0xe0]
+#CHECK: dr %r6, %r9 # encoding: [0x1d,0x69]
+
+ dr %r0,%r0
+ dr %r0,%r15
+ dr %r14,%r0
+ dr %r6,%r9
+
#CHECK: dsg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x0d]
#CHECK: dsg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x0d]
#CHECK: dsg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x0d]
@@ -5919,6 +6495,34 @@
ear %r7, %a8
ear %r15, %a15
+#CHECK: ecag %r0, %r0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0x4c]
+#CHECK: ecag %r0, %r15, 0 # encoding: [0xeb,0x0f,0x00,0x00,0x00,0x4c]
+#CHECK: ecag %r14, %r15, 0 # encoding: [0xeb,0xef,0x00,0x00,0x00,0x4c]
+#CHECK: ecag %r15, %r15, 0 # encoding: [0xeb,0xff,0x00,0x00,0x00,0x4c]
+#CHECK: ecag %r0, %r0, -524288 # encoding: [0xeb,0x00,0x00,0x00,0x80,0x4c]
+#CHECK: ecag %r0, %r0, -1 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x4c]
+#CHECK: ecag %r0, %r0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0x4c]
+#CHECK: ecag %r0, %r0, 1 # encoding: [0xeb,0x00,0x00,0x01,0x00,0x4c]
+#CHECK: ecag %r0, %r0, 524287 # encoding: [0xeb,0x00,0x0f,0xff,0x7f,0x4c]
+#CHECK: ecag %r0, %r0, 0(%r1) # encoding: [0xeb,0x00,0x10,0x00,0x00,0x4c]
+#CHECK: ecag %r0, %r0, 0(%r15) # encoding: [0xeb,0x00,0xf0,0x00,0x00,0x4c]
+#CHECK: ecag %r0, %r0, 524287(%r1) # encoding: [0xeb,0x00,0x1f,0xff,0x7f,0x4c]
+#CHECK: ecag %r0, %r0, 524287(%r15) # encoding: [0xeb,0x00,0xff,0xff,0x7f,0x4c]
+
+ ecag %r0,%r0,0
+ ecag %r0,%r15,0
+ ecag %r14,%r15,0
+ ecag %r15,%r15,0
+ ecag %r0,%r0,-524288
+ ecag %r0,%r0,-1
+ ecag %r0,%r0,0
+ ecag %r0,%r0,1
+ ecag %r0,%r0,524287
+ ecag %r0,%r0,0(%r1)
+ ecag %r0,%r0,0(%r15)
+ ecag %r0,%r0,524287(%r1)
+ ecag %r0,%r0,524287(%r15)
+
#CHECK: ectg 0, 0, %r0 # encoding: [0xc8,0x01,0x00,0x00,0x00,0x00]
#CHECK: ectg 0(%r1), 0(%r15), %r2 # encoding: [0xc8,0x21,0x10,0x00,0xf0,0x00]
#CHECK: ectg 1(%r1), 0(%r15), %r2 # encoding: [0xc8,0x21,0x10,0x01,0xf0,0x00]
@@ -5933,6 +6537,58 @@
ectg 0(%r1),1(%r15),%r2
ectg 0(%r1),4095(%r15),%r2
+#CHECK: ed 0(1), 0 # encoding: [0xde,0x00,0x00,0x00,0x00,0x00]
+#CHECK: ed 0(1), 0(%r1) # encoding: [0xde,0x00,0x00,0x00,0x10,0x00]
+#CHECK: ed 0(1), 0(%r15) # encoding: [0xde,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: ed 0(1), 4095 # encoding: [0xde,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: ed 0(1), 4095(%r1) # encoding: [0xde,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: ed 0(1), 4095(%r15) # encoding: [0xde,0x00,0x00,0x00,0xff,0xff]
+#CHECK: ed 0(1,%r1), 0 # encoding: [0xde,0x00,0x10,0x00,0x00,0x00]
+#CHECK: ed 0(1,%r15), 0 # encoding: [0xde,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: ed 4095(1,%r1), 0 # encoding: [0xde,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: ed 4095(1,%r15), 0 # encoding: [0xde,0x00,0xff,0xff,0x00,0x00]
+#CHECK: ed 0(256,%r1), 0 # encoding: [0xde,0xff,0x10,0x00,0x00,0x00]
+#CHECK: ed 0(256,%r15), 0 # encoding: [0xde,0xff,0xf0,0x00,0x00,0x00]
+
+ ed 0(1), 0
+ ed 0(1), 0(%r1)
+ ed 0(1), 0(%r15)
+ ed 0(1), 4095
+ ed 0(1), 4095(%r1)
+ ed 0(1), 4095(%r15)
+ ed 0(1,%r1), 0
+ ed 0(1,%r15), 0
+ ed 4095(1,%r1), 0
+ ed 4095(1,%r15), 0
+ ed 0(256,%r1), 0
+ ed 0(256,%r15), 0
+
+#CHECK: edmk 0(1), 0 # encoding: [0xdf,0x00,0x00,0x00,0x00,0x00]
+#CHECK: edmk 0(1), 0(%r1) # encoding: [0xdf,0x00,0x00,0x00,0x10,0x00]
+#CHECK: edmk 0(1), 0(%r15) # encoding: [0xdf,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: edmk 0(1), 4095 # encoding: [0xdf,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: edmk 0(1), 4095(%r1) # encoding: [0xdf,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: edmk 0(1), 4095(%r15) # encoding: [0xdf,0x00,0x00,0x00,0xff,0xff]
+#CHECK: edmk 0(1,%r1), 0 # encoding: [0xdf,0x00,0x10,0x00,0x00,0x00]
+#CHECK: edmk 0(1,%r15), 0 # encoding: [0xdf,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: edmk 4095(1,%r1), 0 # encoding: [0xdf,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: edmk 4095(1,%r15), 0 # encoding: [0xdf,0x00,0xff,0xff,0x00,0x00]
+#CHECK: edmk 0(256,%r1), 0 # encoding: [0xdf,0xff,0x10,0x00,0x00,0x00]
+#CHECK: edmk 0(256,%r15), 0 # encoding: [0xdf,0xff,0xf0,0x00,0x00,0x00]
+
+ edmk 0(1), 0
+ edmk 0(1), 0(%r1)
+ edmk 0(1), 0(%r15)
+ edmk 0(1), 4095
+ edmk 0(1), 4095(%r1)
+ edmk 0(1), 4095(%r15)
+ edmk 0(1,%r1), 0
+ edmk 0(1,%r15), 0
+ edmk 4095(1,%r1), 0
+ edmk 4095(1,%r15), 0
+ edmk 0(256,%r1), 0
+ edmk 0(256,%r15), 0
+
#CHECK: efpc %r0 # encoding: [0xb3,0x8c,0x00,0x00]
#CHECK: efpc %r1 # encoding: [0xb3,0x8c,0x00,0x10]
#CHECK: efpc %r15 # encoding: [0xb3,0x8c,0x00,0xf0]
@@ -5941,6 +6597,16 @@
efpc %r1
efpc %r15
+#CHECK: epsw %r0, %r8 # encoding: [0xb9,0x8d,0x00,0x08]
+#CHECK: epsw %r0, %r15 # encoding: [0xb9,0x8d,0x00,0x0f]
+#CHECK: epsw %r15, %r0 # encoding: [0xb9,0x8d,0x00,0xf0]
+#CHECK: epsw %r15, %r8 # encoding: [0xb9,0x8d,0x00,0xf8]
+
+ epsw %r0, %r8
+ epsw %r0, %r15
+ epsw %r15, %r0
+ epsw %r15, %r8
+
#CHECK: ex %r0, 0 # encoding: [0x44,0x00,0x00,0x00]
#CHECK: ex %r0, 4095 # encoding: [0x44,0x00,0x0f,0xff]
#CHECK: ex %r0, 0(%r1) # encoding: [0x44,0x00,0x10,0x00]
@@ -6202,6 +6868,118 @@
ipm %r1
ipm %r15
+#CHECK: kdb %f0, 0 # encoding: [0xed,0x00,0x00,0x00,0x00,0x18]
+#CHECK: kdb %f0, 4095 # encoding: [0xed,0x00,0x0f,0xff,0x00,0x18]
+#CHECK: kdb %f0, 0(%r1) # encoding: [0xed,0x00,0x10,0x00,0x00,0x18]
+#CHECK: kdb %f0, 0(%r15) # encoding: [0xed,0x00,0xf0,0x00,0x00,0x18]
+#CHECK: kdb %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x18]
+#CHECK: kdb %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x18]
+#CHECK: kdb %f15, 0 # encoding: [0xed,0xf0,0x00,0x00,0x00,0x18]
+
+ kdb %f0, 0
+ kdb %f0, 4095
+ kdb %f0, 0(%r1)
+ kdb %f0, 0(%r15)
+ kdb %f0, 4095(%r1,%r15)
+ kdb %f0, 4095(%r15,%r1)
+ kdb %f15, 0
+
+#CHECK: kdbr %f0, %f0 # encoding: [0xb3,0x18,0x00,0x00]
+#CHECK: kdbr %f0, %f15 # encoding: [0xb3,0x18,0x00,0x0f]
+#CHECK: kdbr %f7, %f8 # encoding: [0xb3,0x18,0x00,0x78]
+#CHECK: kdbr %f15, %f0 # encoding: [0xb3,0x18,0x00,0xf0]
+
+ kdbr %f0, %f0
+ kdbr %f0, %f15
+ kdbr %f7, %f8
+ kdbr %f15, %f0
+
+#CHECK: keb %f0, 0 # encoding: [0xed,0x00,0x00,0x00,0x00,0x08]
+#CHECK: keb %f0, 4095 # encoding: [0xed,0x00,0x0f,0xff,0x00,0x08]
+#CHECK: keb %f0, 0(%r1) # encoding: [0xed,0x00,0x10,0x00,0x00,0x08]
+#CHECK: keb %f0, 0(%r15) # encoding: [0xed,0x00,0xf0,0x00,0x00,0x08]
+#CHECK: keb %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x08]
+#CHECK: keb %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x08]
+#CHECK: keb %f15, 0 # encoding: [0xed,0xf0,0x00,0x00,0x00,0x08]
+
+ keb %f0, 0
+ keb %f0, 4095
+ keb %f0, 0(%r1)
+ keb %f0, 0(%r15)
+ keb %f0, 4095(%r1,%r15)
+ keb %f0, 4095(%r15,%r1)
+ keb %f15, 0
+
+#CHECK: kebr %f0, %f0 # encoding: [0xb3,0x08,0x00,0x00]
+#CHECK: kebr %f0, %f15 # encoding: [0xb3,0x08,0x00,0x0f]
+#CHECK: kebr %f7, %f8 # encoding: [0xb3,0x08,0x00,0x78]
+#CHECK: kebr %f15, %f0 # encoding: [0xb3,0x08,0x00,0xf0]
+
+ kebr %f0, %f0
+ kebr %f0, %f15
+ kebr %f7, %f8
+ kebr %f15, %f0
+
+#CHECK: kimd %r0, %r2 # encoding: [0xb9,0x3e,0x00,0x02]
+#CHECK: kimd %r0, %r14 # encoding: [0xb9,0x3e,0x00,0x0e]
+#CHECK: kimd %r15, %r2 # encoding: [0xb9,0x3e,0x00,0xf2]
+#CHECK: kimd %r7, %r10 # encoding: [0xb9,0x3e,0x00,0x7a]
+
+ kimd %r0, %r2
+ kimd %r0, %r14
+ kimd %r15, %r2
+ kimd %r7, %r10
+
+#CHECK: klmd %r0, %r2 # encoding: [0xb9,0x3f,0x00,0x02]
+#CHECK: klmd %r0, %r14 # encoding: [0xb9,0x3f,0x00,0x0e]
+#CHECK: klmd %r15, %r2 # encoding: [0xb9,0x3f,0x00,0xf2]
+#CHECK: klmd %r7, %r10 # encoding: [0xb9,0x3f,0x00,0x7a]
+
+ klmd %r0, %r2
+ klmd %r0, %r14
+ klmd %r15, %r2
+ klmd %r7, %r10
+
+#CHECK: km %r2, %r2 # encoding: [0xb9,0x2e,0x00,0x22]
+#CHECK: km %r2, %r14 # encoding: [0xb9,0x2e,0x00,0x2e]
+#CHECK: km %r14, %r2 # encoding: [0xb9,0x2e,0x00,0xe2]
+#CHECK: km %r6, %r10 # encoding: [0xb9,0x2e,0x00,0x6a]
+
+ km %r2, %r2
+ km %r2, %r14
+ km %r14, %r2
+ km %r6, %r10
+
+#CHECK: kmac %r0, %r2 # encoding: [0xb9,0x1e,0x00,0x02]
+#CHECK: kmac %r0, %r14 # encoding: [0xb9,0x1e,0x00,0x0e]
+#CHECK: kmac %r15, %r2 # encoding: [0xb9,0x1e,0x00,0xf2]
+#CHECK: kmac %r7, %r10 # encoding: [0xb9,0x1e,0x00,0x7a]
+
+ kmac %r0, %r2
+ kmac %r0, %r14
+ kmac %r15, %r2
+ kmac %r7, %r10
+
+#CHECK: kmc %r2, %r2 # encoding: [0xb9,0x2f,0x00,0x22]
+#CHECK: kmc %r2, %r14 # encoding: [0xb9,0x2f,0x00,0x2e]
+#CHECK: kmc %r14, %r2 # encoding: [0xb9,0x2f,0x00,0xe2]
+#CHECK: kmc %r6, %r10 # encoding: [0xb9,0x2f,0x00,0x6a]
+
+ kmc %r2, %r2
+ kmc %r2, %r14
+ kmc %r14, %r2
+ kmc %r6, %r10
+
+#CHECK: kxbr %f0, %f0 # encoding: [0xb3,0x48,0x00,0x00]
+#CHECK: kxbr %f0, %f13 # encoding: [0xb3,0x48,0x00,0x0d]
+#CHECK: kxbr %f8, %f8 # encoding: [0xb3,0x48,0x00,0x88]
+#CHECK: kxbr %f13, %f0 # encoding: [0xb3,0x48,0x00,0xd0]
+
+ kxbr %f0, %f0
+ kxbr %f0, %f13
+ kxbr %f8, %f8
+ kxbr %f13, %f0
+
#CHECK: l %r0, 0 # encoding: [0x58,0x00,0x00,0x00]
#CHECK: l %r0, 4095 # encoding: [0x58,0x00,0x0f,0xff]
#CHECK: l %r0, 0(%r1) # encoding: [0x58,0x00,0x10,0x00]
@@ -7101,36 +7879,6 @@
llgcr %r7, %r8
llgcr %r15, %r0
-#CHECK: llgt %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x17]
-#CHECK: llgt %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x17]
-#CHECK: llgt %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x17]
-#CHECK: llgt %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x17]
-#CHECK: llgt %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x17]
-#CHECK: llgt %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x17]
-#CHECK: llgt %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x17]
-#CHECK: llgt %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x17]
-#CHECK: llgt %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x17]
-#CHECK: llgt %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x17]
-
- llgt %r0, -524288
- llgt %r0, -1
- llgt %r0, 0
- llgt %r0, 1
- llgt %r0, 524287
- llgt %r0, 0(%r1)
- llgt %r0, 0(%r15)
- llgt %r0, 524287(%r1,%r15)
- llgt %r0, 524287(%r15,%r1)
- llgt %r15, 0
-
-#CHECK: llgtr %r0, %r15 # encoding: [0xb9,0x17,0x00,0x0f]
-#CHECK: llgtr %r7, %r8 # encoding: [0xb9,0x17,0x00,0x78]
-#CHECK: llgtr %r15, %r0 # encoding: [0xb9,0x17,0x00,0xf0]
-
- llgtr %r0, %r15
- llgtr %r7, %r8
- llgtr %r15, %r0
-
#CHECK: llgf %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x16]
#CHECK: llgf %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x16]
#CHECK: llgf %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x16]
@@ -7221,7 +7969,6 @@
llgh %r0, 524287(%r15,%r1)
llgh %r15, 0
-
#CHECK: llghr %r0, %r15 # encoding: [0xb9,0x85,0x00,0x0f]
#CHECK: llghr %r7, %r8 # encoding: [0xb9,0x85,0x00,0x78]
#CHECK: llghr %r15, %r0 # encoding: [0xb9,0x85,0x00,0xf0]
@@ -7267,6 +8014,36 @@
llghrl %r7,frob@PLT
llghrl %r8,frob@PLT
+#CHECK: llgt %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x17]
+#CHECK: llgt %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x17]
+#CHECK: llgt %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x17]
+#CHECK: llgt %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x17]
+#CHECK: llgt %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x17]
+#CHECK: llgt %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x17]
+#CHECK: llgt %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x17]
+#CHECK: llgt %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x17]
+#CHECK: llgt %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x17]
+#CHECK: llgt %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x17]
+
+ llgt %r0, -524288
+ llgt %r0, -1
+ llgt %r0, 0
+ llgt %r0, 1
+ llgt %r0, 524287
+ llgt %r0, 0(%r1)
+ llgt %r0, 0(%r15)
+ llgt %r0, 524287(%r1,%r15)
+ llgt %r0, 524287(%r15,%r1)
+ llgt %r15, 0
+
+#CHECK: llgtr %r0, %r15 # encoding: [0xb9,0x17,0x00,0x0f]
+#CHECK: llgtr %r7, %r8 # encoding: [0xb9,0x17,0x00,0x78]
+#CHECK: llgtr %r15, %r0 # encoding: [0xb9,0x17,0x00,0xf0]
+
+ llgtr %r0, %r15
+ llgtr %r7, %r8
+ llgtr %r15, %r0
+
#CHECK: llh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x95]
#CHECK: llh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x95]
#CHECK: llh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x95]
@@ -7412,6 +8189,26 @@
lm %r0,%r0,4095(%r1)
lm %r0,%r0,4095(%r15)
+#CHECK: lmd %r0, %r0, 0, 0 # encoding: [0xef,0x00,0x00,0x00,0x00,0x00]
+#CHECK: lmd %r0, %r15, 0, 0 # encoding: [0xef,0x0f,0x00,0x00,0x00,0x00]
+#CHECK: lmd %r14, %r15, 0, 0 # encoding: [0xef,0xef,0x00,0x00,0x00,0x00]
+#CHECK: lmd %r15, %r15, 0, 0 # encoding: [0xef,0xff,0x00,0x00,0x00,0x00]
+#CHECK: lmd %r2, %r4, 0(%r1), 0(%r15) # encoding: [0xef,0x24,0x10,0x00,0xf0,0x00]
+#CHECK: lmd %r2, %r4, 1(%r1), 0(%r15) # encoding: [0xef,0x24,0x10,0x01,0xf0,0x00]
+#CHECK: lmd %r2, %r4, 4095(%r1), 0(%r15) # encoding: [0xef,0x24,0x1f,0xff,0xf0,0x00]
+#CHECK: lmd %r2, %r4, 0(%r1), 1(%r15) # encoding: [0xef,0x24,0x10,0x00,0xf0,0x01]
+#CHECK: lmd %r2, %r4, 0(%r1), 4095(%r15) # encoding: [0xef,0x24,0x10,0x00,0xff,0xff]
+
+ lmd %r0, %r0, 0, 0
+ lmd %r0, %r15, 0, 0
+ lmd %r14, %r15, 0, 0
+ lmd %r15, %r15, 0, 0
+ lmd %r2, %r4, 0(%r1), 0(%r15)
+ lmd %r2, %r4, 1(%r1), 0(%r15)
+ lmd %r2, %r4, 4095(%r1), 0(%r15)
+ lmd %r2, %r4, 0(%r1), 1(%r15)
+ lmd %r2, %r4, 0(%r1), 4095(%r15)
+
#CHECK: lmg %r0, %r0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0x04]
#CHECK: lmg %r0, %r15, 0 # encoding: [0xeb,0x0f,0x00,0x00,0x00,0x04]
#CHECK: lmg %r14, %r15, 0 # encoding: [0xeb,0xef,0x00,0x00,0x00,0x04]
@@ -7685,28 +8482,6 @@
lrl %r7,frob@PLT
lrl %r8,frob@PLT
-#CHECK: lrvh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x1f]
-#CHECK: lrvh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x1f]
-#CHECK: lrvh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x1f]
-#CHECK: lrvh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x1f]
-#CHECK: lrvh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x1f]
-#CHECK: lrvh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x1f]
-#CHECK: lrvh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x1f]
-#CHECK: lrvh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x1f]
-#CHECK: lrvh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x1f]
-#CHECK: lrvh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x1f]
-
- lrvh %r0,-524288
- lrvh %r0,-1
- lrvh %r0,0
- lrvh %r0,1
- lrvh %r0,524287
- lrvh %r0,0(%r1)
- lrvh %r0,0(%r15)
- lrvh %r0,524287(%r1,%r15)
- lrvh %r0,524287(%r15,%r1)
- lrvh %r15,0
-
#CHECK: lrv %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x1e]
#CHECK: lrv %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x1e]
#CHECK: lrv %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x1e]
@@ -7763,6 +8538,28 @@
lrvgr %r7,%r8
lrvgr %r15,%r15
+#CHECK: lrvh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x1f]
+#CHECK: lrvh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x1f]
+#CHECK: lrvh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x1f]
+#CHECK: lrvh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x1f]
+#CHECK: lrvh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x1f]
+#CHECK: lrvh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x1f]
+#CHECK: lrvh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x1f]
+#CHECK: lrvh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x1f]
+#CHECK: lrvh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x1f]
+#CHECK: lrvh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x1f]
+
+ lrvh %r0,-524288
+ lrvh %r0,-1
+ lrvh %r0,0
+ lrvh %r0,1
+ lrvh %r0,524287
+ lrvh %r0,0(%r1)
+ lrvh %r0,0(%r15)
+ lrvh %r0,524287(%r1,%r15)
+ lrvh %r0,524287(%r15,%r1)
+ lrvh %r15,0
+
#CHECK: lrvr %r0, %r0 # encoding: [0xb9,0x1f,0x00,0x00]
#CHECK: lrvr %r0, %r15 # encoding: [0xb9,0x1f,0x00,0x0f]
#CHECK: lrvr %r15, %r0 # encoding: [0xb9,0x1f,0x00,0xf0]
@@ -7797,6 +8594,26 @@
lt %r0, 524287(%r15,%r1)
lt %r15, 0
+#CHECK: ltdbr %f0, %f9 # encoding: [0xb3,0x12,0x00,0x09]
+#CHECK: ltdbr %f0, %f15 # encoding: [0xb3,0x12,0x00,0x0f]
+#CHECK: ltdbr %f15, %f0 # encoding: [0xb3,0x12,0x00,0xf0]
+#CHECK: ltdbr %f15, %f9 # encoding: [0xb3,0x12,0x00,0xf9]
+
+ ltdbr %f0,%f9
+ ltdbr %f0,%f15
+ ltdbr %f15,%f0
+ ltdbr %f15,%f9
+
+#CHECK: ltebr %f0, %f9 # encoding: [0xb3,0x02,0x00,0x09]
+#CHECK: ltebr %f0, %f15 # encoding: [0xb3,0x02,0x00,0x0f]
+#CHECK: ltebr %f15, %f0 # encoding: [0xb3,0x02,0x00,0xf0]
+#CHECK: ltebr %f15, %f9 # encoding: [0xb3,0x02,0x00,0xf9]
+
+ ltebr %f0,%f9
+ ltebr %f0,%f15
+ ltebr %f15,%f0
+ ltebr %f15,%f9
+
#CHECK: ltg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x02]
#CHECK: ltg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x02]
#CHECK: ltg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x02]
@@ -7841,26 +8658,6 @@
ltgf %r0, 524287(%r15,%r1)
ltgf %r15, 0
-#CHECK: ltdbr %f0, %f9 # encoding: [0xb3,0x12,0x00,0x09]
-#CHECK: ltdbr %f0, %f15 # encoding: [0xb3,0x12,0x00,0x0f]
-#CHECK: ltdbr %f15, %f0 # encoding: [0xb3,0x12,0x00,0xf0]
-#CHECK: ltdbr %f15, %f9 # encoding: [0xb3,0x12,0x00,0xf9]
-
- ltdbr %f0,%f9
- ltdbr %f0,%f15
- ltdbr %f15,%f0
- ltdbr %f15,%f9
-
-#CHECK: ltebr %f0, %f9 # encoding: [0xb3,0x02,0x00,0x09]
-#CHECK: ltebr %f0, %f15 # encoding: [0xb3,0x02,0x00,0x0f]
-#CHECK: ltebr %f15, %f0 # encoding: [0xb3,0x02,0x00,0xf0]
-#CHECK: ltebr %f15, %f9 # encoding: [0xb3,0x02,0x00,0xf9]
-
- ltebr %f0,%f9
- ltebr %f0,%f15
- ltebr %f15,%f0
- ltebr %f15,%f9
-
#CHECK: ltgfr %r0, %r9 # encoding: [0xb9,0x12,0x00,0x09]
#CHECK: ltgfr %r0, %r15 # encoding: [0xb9,0x12,0x00,0x0f]
#CHECK: ltgfr %r15, %r0 # encoding: [0xb9,0x12,0x00,0xf0]
@@ -7957,6 +8754,22 @@
lzxr %f8
lzxr %f13
+#CHECK: m %r0, 0 # encoding: [0x5c,0x00,0x00,0x00]
+#CHECK: m %r0, 4095 # encoding: [0x5c,0x00,0x0f,0xff]
+#CHECK: m %r0, 0(%r1) # encoding: [0x5c,0x00,0x10,0x00]
+#CHECK: m %r0, 0(%r15) # encoding: [0x5c,0x00,0xf0,0x00]
+#CHECK: m %r0, 4095(%r1,%r15) # encoding: [0x5c,0x01,0xff,0xff]
+#CHECK: m %r0, 4095(%r15,%r1) # encoding: [0x5c,0x0f,0x1f,0xff]
+#CHECK: m %r14, 0 # encoding: [0x5c,0xe0,0x00,0x00]
+
+ m %r0, 0
+ m %r0, 4095
+ m %r0, 0(%r1)
+ m %r0, 0(%r15)
+ m %r0, 4095(%r1,%r15)
+ m %r0, 4095(%r15,%r1)
+ m %r14, 0
+
#CHECK: madb %f0, %f0, 0 # encoding: [0xed,0x00,0x00,0x00,0x00,0x1e]
#CHECK: madb %f0, %f0, 4095 # encoding: [0xed,0x00,0x0f,0xff,0x00,0x1e]
#CHECK: madb %f0, %f0, 0(%r1) # encoding: [0xed,0x00,0x10,0x00,0x00,0x1e]
@@ -8025,6 +8838,22 @@
maebr %f7, %f8, %f9
maebr %f15, %f15, %f15
+#CHECK: mc 0, 0 # encoding: [0xaf,0x00,0x00,0x00]
+#CHECK: mc 4095, 0 # encoding: [0xaf,0x00,0x0f,0xff]
+#CHECK: mc 0, 255 # encoding: [0xaf,0xff,0x00,0x00]
+#CHECK: mc 0(%r1), 42 # encoding: [0xaf,0x2a,0x10,0x00]
+#CHECK: mc 0(%r15), 42 # encoding: [0xaf,0x2a,0xf0,0x00]
+#CHECK: mc 4095(%r1), 42 # encoding: [0xaf,0x2a,0x1f,0xff]
+#CHECK: mc 4095(%r15), 42 # encoding: [0xaf,0x2a,0xff,0xff]
+
+ mc 0, 0
+ mc 4095, 0
+ mc 0, 255
+ mc 0(%r1), 42
+ mc 0(%r15), 42
+ mc 4095(%r1), 42
+ mc 4095(%r15), 42
+
#CHECK: mdb %f0, 0 # encoding: [0xed,0x00,0x00,0x00,0x00,0x1c]
#CHECK: mdb %f0, 4095 # encoding: [0xed,0x00,0x0f,0xff,0x00,0x1c]
#CHECK: mdb %f0, 0(%r1) # encoding: [0xed,0x00,0x10,0x00,0x00,0x1c]
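
mc (monitor call) above uses the SI format: the immediate occupies the second byte and the storage operand the remaining base/displacement halfword. An illustrative sketch (decode_si is a made-up name), checked against the mc lines above:

def decode_si(enc):
    """Return (i2, b1, d1) from a 4-byte SI encoding."""
    return enc[1], enc[2] >> 4, ((enc[2] & 0xF) << 8) | enc[3]

# "mc 4095(%r15), 42" is checked as [0xaf,0x2a,0xff,0xff]:
assert decode_si([0xaf, 0x2a, 0xff, 0xff]) == (42, 15, 4095)
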
@@ -8103,6 +8932,28 @@
meebr %f7, %f8
meebr %f15, %f0
+#CHECK: mfy %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x5c]
+#CHECK: mfy %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x5c]
+#CHECK: mfy %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x5c]
+#CHECK: mfy %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x5c]
+#CHECK: mfy %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x5c]
+#CHECK: mfy %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x5c]
+#CHECK: mfy %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x5c]
+#CHECK: mfy %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x5c]
+#CHECK: mfy %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x5c]
+#CHECK: mfy %r14, 0 # encoding: [0xe3,0xe0,0x00,0x00,0x00,0x5c]
+
+ mfy %r0, -524288
+ mfy %r0, -1
+ mfy %r0, 0
+ mfy %r0, 1
+ mfy %r0, 524287
+ mfy %r0, 0(%r1)
+ mfy %r0, 0(%r15)
+ mfy %r0, 524287(%r1,%r15)
+ mfy %r0, 524287(%r15,%r1)
+ mfy %r14, 0
+
#CHECK: mghi %r0, -32768 # encoding: [0xa7,0x0d,0x80,0x00]
#CHECK: mghi %r0, -1 # encoding: [0xa7,0x0d,0xff,0xff]
#CHECK: mghi %r0, 0 # encoding: [0xa7,0x0d,0x00,0x00]
@@ -8169,6 +9020,28 @@
mhy %r0, 524287(%r15,%r1)
mhy %r15, 0
+#CHECK: ml %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x96]
+#CHECK: ml %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x96]
+#CHECK: ml %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x96]
+#CHECK: ml %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x96]
+#CHECK: ml %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x96]
+#CHECK: ml %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x96]
+#CHECK: ml %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x96]
+#CHECK: ml %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x96]
+#CHECK: ml %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x96]
+#CHECK: ml %r14, 0 # encoding: [0xe3,0xe0,0x00,0x00,0x00,0x96]
+
+ ml %r0, -524288
+ ml %r0, -1
+ ml %r0, 0
+ ml %r0, 1
+ ml %r0, 524287
+ ml %r0, 0(%r1)
+ ml %r0, 0(%r15)
+ ml %r0, 524287(%r1,%r15)
+ ml %r0, 524287(%r15,%r1)
+ ml %r14, 0
+
#CHECK: mlg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x86]
#CHECK: mlg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x86]
#CHECK: mlg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x86]
@@ -8201,6 +9074,56 @@
mlgr %r14,%r0
mlgr %r6,%r9
+#CHECK: mlr %r0, %r0 # encoding: [0xb9,0x96,0x00,0x00]
+#CHECK: mlr %r0, %r15 # encoding: [0xb9,0x96,0x00,0x0f]
+#CHECK: mlr %r14, %r0 # encoding: [0xb9,0x96,0x00,0xe0]
+#CHECK: mlr %r6, %r9 # encoding: [0xb9,0x96,0x00,0x69]
+
+ mlr %r0,%r0
+ mlr %r0,%r15
+ mlr %r14,%r0
+ mlr %r6,%r9
+
+#CHECK: mp 0(1), 0(1) # encoding: [0xfc,0x00,0x00,0x00,0x00,0x00]
+#CHECK: mp 0(1), 0(1,%r1) # encoding: [0xfc,0x00,0x00,0x00,0x10,0x00]
+#CHECK: mp 0(1), 0(1,%r15) # encoding: [0xfc,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: mp 0(1), 4095(1) # encoding: [0xfc,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: mp 0(1), 4095(1,%r1) # encoding: [0xfc,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: mp 0(1), 4095(1,%r15) # encoding: [0xfc,0x00,0x00,0x00,0xff,0xff]
+#CHECK: mp 0(1,%r1), 0(1) # encoding: [0xfc,0x00,0x10,0x00,0x00,0x00]
+#CHECK: mp 0(1,%r15), 0(1) # encoding: [0xfc,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: mp 4095(1,%r1), 0(1) # encoding: [0xfc,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: mp 4095(1,%r15), 0(1) # encoding: [0xfc,0x00,0xff,0xff,0x00,0x00]
+#CHECK: mp 0(16,%r1), 0(1) # encoding: [0xfc,0xf0,0x10,0x00,0x00,0x00]
+#CHECK: mp 0(16,%r15), 0(1) # encoding: [0xfc,0xf0,0xf0,0x00,0x00,0x00]
+#CHECK: mp 0(1), 0(16,%r1) # encoding: [0xfc,0x0f,0x00,0x00,0x10,0x00]
+#CHECK: mp 0(1), 0(16,%r15) # encoding: [0xfc,0x0f,0x00,0x00,0xf0,0x00]
+
+ mp 0(1), 0(1)
+ mp 0(1), 0(1,%r1)
+ mp 0(1), 0(1,%r15)
+ mp 0(1), 4095(1)
+ mp 0(1), 4095(1,%r1)
+ mp 0(1), 4095(1,%r15)
+ mp 0(1,%r1), 0(1)
+ mp 0(1,%r15), 0(1)
+ mp 4095(1,%r1), 0(1)
+ mp 4095(1,%r15), 0(1)
+ mp 0(16,%r1), 0(1)
+ mp 0(16,%r15), 0(1)
+ mp 0(1), 0(16,%r1)
+ mp 0(1), 0(16,%r15)
+
+#CHECK: mr %r0, %r0 # encoding: [0x1c,0x00]
+#CHECK: mr %r0, %r15 # encoding: [0x1c,0x0f]
+#CHECK: mr %r14, %r0 # encoding: [0x1c,0xe0]
+#CHECK: mr %r6, %r9 # encoding: [0x1c,0x69]
+
+ mr %r0,%r0
+ mr %r0,%r15
+ mr %r14,%r0
+ mr %r6,%r9
+
#CHECK: ms %r0, 0 # encoding: [0x71,0x00,0x00,0x00]
#CHECK: ms %r0, 4095 # encoding: [0x71,0x00,0x0f,0xff]
#CHECK: ms %r0, 0(%r1) # encoding: [0x71,0x00,0x10,0x00]
@@ -8435,6 +9358,32 @@
mvc 0(256,%r1), 0
mvc 0(256,%r15), 0
+#CHECK: mvcin 0(1), 0 # encoding: [0xe8,0x00,0x00,0x00,0x00,0x00]
+#CHECK: mvcin 0(1), 0(%r1) # encoding: [0xe8,0x00,0x00,0x00,0x10,0x00]
+#CHECK: mvcin 0(1), 0(%r15) # encoding: [0xe8,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: mvcin 0(1), 4095 # encoding: [0xe8,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: mvcin 0(1), 4095(%r1) # encoding: [0xe8,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: mvcin 0(1), 4095(%r15) # encoding: [0xe8,0x00,0x00,0x00,0xff,0xff]
+#CHECK: mvcin 0(1,%r1), 0 # encoding: [0xe8,0x00,0x10,0x00,0x00,0x00]
+#CHECK: mvcin 0(1,%r15), 0 # encoding: [0xe8,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: mvcin 4095(1,%r1), 0 # encoding: [0xe8,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: mvcin 4095(1,%r15), 0 # encoding: [0xe8,0x00,0xff,0xff,0x00,0x00]
+#CHECK: mvcin 0(256,%r1), 0 # encoding: [0xe8,0xff,0x10,0x00,0x00,0x00]
+#CHECK: mvcin 0(256,%r15), 0 # encoding: [0xe8,0xff,0xf0,0x00,0x00,0x00]
+
+ mvcin 0(1), 0
+ mvcin 0(1), 0(%r1)
+ mvcin 0(1), 0(%r15)
+ mvcin 0(1), 4095
+ mvcin 0(1), 4095(%r1)
+ mvcin 0(1), 4095(%r15)
+ mvcin 0(1,%r1), 0
+ mvcin 0(1,%r15), 0
+ mvcin 4095(1,%r1), 0
+ mvcin 4095(1,%r15), 0
+ mvcin 0(256,%r1), 0
+ mvcin 0(256,%r15), 0
+
#CHECK: mvck 0(%r0), 0, %r3 # encoding: [0xd9,0x03,0x00,0x00,0x00,0x00]
#CHECK: mvck 0(%r1), 0, %r3 # encoding: [0xd9,0x13,0x00,0x00,0x00,0x00]
#CHECK: mvck 0(%r1), 0(%r1), %r3 # encoding: [0xd9,0x13,0x00,0x00,0x10,0x00]
@@ -8463,6 +9412,54 @@
mvck 0(%r2,%r1), 0, %r3
mvck 0(%r2,%r15), 0, %r3
+#CHECK: mvcl %r0, %r8 # encoding: [0x0e,0x08]
+#CHECK: mvcl %r0, %r14 # encoding: [0x0e,0x0e]
+#CHECK: mvcl %r14, %r0 # encoding: [0x0e,0xe0]
+#CHECK: mvcl %r14, %r8 # encoding: [0x0e,0xe8]
+
+ mvcl %r0, %r8
+ mvcl %r0, %r14
+ mvcl %r14, %r0
+ mvcl %r14, %r8
+
+#CHECK: mvcle %r0, %r0, 0 # encoding: [0xa8,0x00,0x00,0x00]
+#CHECK: mvcle %r0, %r14, 4095 # encoding: [0xa8,0x0e,0x0f,0xff]
+#CHECK: mvcle %r0, %r0, 0(%r1) # encoding: [0xa8,0x00,0x10,0x00]
+#CHECK: mvcle %r0, %r0, 0(%r15) # encoding: [0xa8,0x00,0xf0,0x00]
+#CHECK: mvcle %r14, %r14, 4095(%r1) # encoding: [0xa8,0xee,0x1f,0xff]
+#CHECK: mvcle %r0, %r0, 4095(%r15) # encoding: [0xa8,0x00,0xff,0xff]
+#CHECK: mvcle %r14, %r0, 0 # encoding: [0xa8,0xe0,0x00,0x00]
+
+ mvcle %r0, %r0, 0
+ mvcle %r0, %r14, 4095
+ mvcle %r0, %r0, 0(%r1)
+ mvcle %r0, %r0, 0(%r15)
+ mvcle %r14, %r14, 4095(%r1)
+ mvcle %r0, %r0, 4095(%r15)
+ mvcle %r14, %r0, 0
+
+#CHECK: mvclu %r0, %r0, -524288 # encoding: [0xeb,0x00,0x00,0x00,0x80,0x8e]
+#CHECK: mvclu %r0, %r0, -1 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x8e]
+#CHECK: mvclu %r0, %r14, 0 # encoding: [0xeb,0x0e,0x00,0x00,0x00,0x8e]
+#CHECK: mvclu %r0, %r14, 1 # encoding: [0xeb,0x0e,0x00,0x01,0x00,0x8e]
+#CHECK: mvclu %r0, %r8, 524287 # encoding: [0xeb,0x08,0x0f,0xff,0x7f,0x8e]
+#CHECK: mvclu %r0, %r8, 0(%r1) # encoding: [0xeb,0x08,0x10,0x00,0x00,0x8e]
+#CHECK: mvclu %r0, %r4, 0(%r15) # encoding: [0xeb,0x04,0xf0,0x00,0x00,0x8e]
+#CHECK: mvclu %r0, %r4, 524287(%r15) # encoding: [0xeb,0x04,0xff,0xff,0x7f,0x8e]
+#CHECK: mvclu %r0, %r0, 524287(%r1) # encoding: [0xeb,0x00,0x1f,0xff,0x7f,0x8e]
+#CHECK: mvclu %r14, %r0, 0 # encoding: [0xeb,0xe0,0x00,0x00,0x00,0x8e]
+
+ mvclu %r0, %r0, -524288
+ mvclu %r0, %r0, -1
+ mvclu %r0, %r14, 0
+ mvclu %r0, %r14, 1
+ mvclu %r0, %r8, 524287
+ mvclu %r0, %r8, 0(%r1)
+ mvclu %r0, %r4, 0(%r15)
+ mvclu %r0, %r4, 524287(%r15)
+ mvclu %r0, %r0, 524287(%r1)
+ mvclu %r14, %r0, 0
+
#CHECK: mvghi 0, 0 # encoding: [0xe5,0x48,0x00,0x00,0x00,0x00]
#CHECK: mvghi 4095, 0 # encoding: [0xe5,0x48,0x0f,0xff,0x00,0x00]
#CHECK: mvghi 0, -32768 # encoding: [0xe5,0x48,0x00,0x00,0x80,0x00]
@@ -8573,6 +9570,62 @@
mviy 524287(%r1), 42
mviy 524287(%r15), 42
+#CHECK: mvn 0(1), 0 # encoding: [0xd1,0x00,0x00,0x00,0x00,0x00]
+#CHECK: mvn 0(1), 0(%r1) # encoding: [0xd1,0x00,0x00,0x00,0x10,0x00]
+#CHECK: mvn 0(1), 0(%r15) # encoding: [0xd1,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: mvn 0(1), 4095 # encoding: [0xd1,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: mvn 0(1), 4095(%r1) # encoding: [0xd1,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: mvn 0(1), 4095(%r15) # encoding: [0xd1,0x00,0x00,0x00,0xff,0xff]
+#CHECK: mvn 0(1,%r1), 0 # encoding: [0xd1,0x00,0x10,0x00,0x00,0x00]
+#CHECK: mvn 0(1,%r15), 0 # encoding: [0xd1,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: mvn 4095(1,%r1), 0 # encoding: [0xd1,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: mvn 4095(1,%r15), 0 # encoding: [0xd1,0x00,0xff,0xff,0x00,0x00]
+#CHECK: mvn 0(256,%r1), 0 # encoding: [0xd1,0xff,0x10,0x00,0x00,0x00]
+#CHECK: mvn 0(256,%r15), 0 # encoding: [0xd1,0xff,0xf0,0x00,0x00,0x00]
+
+ mvn 0(1), 0
+ mvn 0(1), 0(%r1)
+ mvn 0(1), 0(%r15)
+ mvn 0(1), 4095
+ mvn 0(1), 4095(%r1)
+ mvn 0(1), 4095(%r15)
+ mvn 0(1,%r1), 0
+ mvn 0(1,%r15), 0
+ mvn 4095(1,%r1), 0
+ mvn 4095(1,%r15), 0
+ mvn 0(256,%r1), 0
+ mvn 0(256,%r15), 0
+
+#CHECK: mvo 0(1), 0(1) # encoding: [0xf1,0x00,0x00,0x00,0x00,0x00]
+#CHECK: mvo 0(1), 0(1,%r1) # encoding: [0xf1,0x00,0x00,0x00,0x10,0x00]
+#CHECK: mvo 0(1), 0(1,%r15) # encoding: [0xf1,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: mvo 0(1), 4095(1) # encoding: [0xf1,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: mvo 0(1), 4095(1,%r1) # encoding: [0xf1,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: mvo 0(1), 4095(1,%r15) # encoding: [0xf1,0x00,0x00,0x00,0xff,0xff]
+#CHECK: mvo 0(1,%r1), 0(1) # encoding: [0xf1,0x00,0x10,0x00,0x00,0x00]
+#CHECK: mvo 0(1,%r15), 0(1) # encoding: [0xf1,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: mvo 4095(1,%r1), 0(1) # encoding: [0xf1,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: mvo 4095(1,%r15), 0(1) # encoding: [0xf1,0x00,0xff,0xff,0x00,0x00]
+#CHECK: mvo 0(16,%r1), 0(1) # encoding: [0xf1,0xf0,0x10,0x00,0x00,0x00]
+#CHECK: mvo 0(16,%r15), 0(1) # encoding: [0xf1,0xf0,0xf0,0x00,0x00,0x00]
+#CHECK: mvo 0(1), 0(16,%r1) # encoding: [0xf1,0x0f,0x00,0x00,0x10,0x00]
+#CHECK: mvo 0(1), 0(16,%r15) # encoding: [0xf1,0x0f,0x00,0x00,0xf0,0x00]
+
+ mvo 0(1), 0(1)
+ mvo 0(1), 0(1,%r1)
+ mvo 0(1), 0(1,%r15)
+ mvo 0(1), 4095(1)
+ mvo 0(1), 4095(1,%r1)
+ mvo 0(1), 4095(1,%r15)
+ mvo 0(1,%r1), 0(1)
+ mvo 0(1,%r15), 0(1)
+ mvo 4095(1,%r1), 0(1)
+ mvo 4095(1,%r15), 0(1)
+ mvo 0(16,%r1), 0(1)
+ mvo 0(16,%r15), 0(1)
+ mvo 0(1), 0(16,%r1)
+ mvo 0(1), 0(16,%r15)
+
#CHECK: mvst %r0, %r0 # encoding: [0xb2,0x55,0x00,0x00]
#CHECK: mvst %r0, %r15 # encoding: [0xb2,0x55,0x00,0x0f]
#CHECK: mvst %r15, %r0 # encoding: [0xb2,0x55,0x00,0xf0]
@@ -8583,6 +9636,32 @@
mvst %r15,%r0
mvst %r7,%r8
+#CHECK: mvz 0(1), 0 # encoding: [0xd3,0x00,0x00,0x00,0x00,0x00]
+#CHECK: mvz 0(1), 0(%r1) # encoding: [0xd3,0x00,0x00,0x00,0x10,0x00]
+#CHECK: mvz 0(1), 0(%r15) # encoding: [0xd3,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: mvz 0(1), 4095 # encoding: [0xd3,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: mvz 0(1), 4095(%r1) # encoding: [0xd3,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: mvz 0(1), 4095(%r15) # encoding: [0xd3,0x00,0x00,0x00,0xff,0xff]
+#CHECK: mvz 0(1,%r1), 0 # encoding: [0xd3,0x00,0x10,0x00,0x00,0x00]
+#CHECK: mvz 0(1,%r15), 0 # encoding: [0xd3,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: mvz 4095(1,%r1), 0 # encoding: [0xd3,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: mvz 4095(1,%r15), 0 # encoding: [0xd3,0x00,0xff,0xff,0x00,0x00]
+#CHECK: mvz 0(256,%r1), 0 # encoding: [0xd3,0xff,0x10,0x00,0x00,0x00]
+#CHECK: mvz 0(256,%r15), 0 # encoding: [0xd3,0xff,0xf0,0x00,0x00,0x00]
+
+ mvz 0(1), 0
+ mvz 0(1), 0(%r1)
+ mvz 0(1), 0(%r15)
+ mvz 0(1), 4095
+ mvz 0(1), 4095(%r1)
+ mvz 0(1), 4095(%r15)
+ mvz 0(1,%r1), 0
+ mvz 0(1,%r15), 0
+ mvz 4095(1,%r1), 0
+ mvz 4095(1,%r15), 0
+ mvz 0(256,%r1), 0
+ mvz 0(256,%r15), 0
+
#CHECK: mxbr %f0, %f0 # encoding: [0xb3,0x4c,0x00,0x00]
#CHECK: mxbr %f0, %f13 # encoding: [0xb3,0x4c,0x00,0x0d]
#CHECK: mxbr %f8, %f5 # encoding: [0xb3,0x4c,0x00,0x85]
@@ -9025,6 +10104,36 @@
oy %r0, 524287(%r15,%r1)
oy %r15, 0
+#CHECK: pack 0(1), 0(1) # encoding: [0xf2,0x00,0x00,0x00,0x00,0x00]
+#CHECK: pack 0(1), 0(1,%r1) # encoding: [0xf2,0x00,0x00,0x00,0x10,0x00]
+#CHECK: pack 0(1), 0(1,%r15) # encoding: [0xf2,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: pack 0(1), 4095(1) # encoding: [0xf2,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: pack 0(1), 4095(1,%r1) # encoding: [0xf2,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: pack 0(1), 4095(1,%r15) # encoding: [0xf2,0x00,0x00,0x00,0xff,0xff]
+#CHECK: pack 0(1,%r1), 0(1) # encoding: [0xf2,0x00,0x10,0x00,0x00,0x00]
+#CHECK: pack 0(1,%r15), 0(1) # encoding: [0xf2,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: pack 4095(1,%r1), 0(1) # encoding: [0xf2,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: pack 4095(1,%r15), 0(1) # encoding: [0xf2,0x00,0xff,0xff,0x00,0x00]
+#CHECK: pack 0(16,%r1), 0(1) # encoding: [0xf2,0xf0,0x10,0x00,0x00,0x00]
+#CHECK: pack 0(16,%r15), 0(1) # encoding: [0xf2,0xf0,0xf0,0x00,0x00,0x00]
+#CHECK: pack 0(1), 0(16,%r1) # encoding: [0xf2,0x0f,0x00,0x00,0x10,0x00]
+#CHECK: pack 0(1), 0(16,%r15) # encoding: [0xf2,0x0f,0x00,0x00,0xf0,0x00]
+
+ pack 0(1), 0(1)
+ pack 0(1), 0(1,%r1)
+ pack 0(1), 0(1,%r15)
+ pack 0(1), 4095(1)
+ pack 0(1), 4095(1,%r1)
+ pack 0(1), 4095(1,%r15)
+ pack 0(1,%r1), 0(1)
+ pack 0(1,%r15), 0(1)
+ pack 4095(1,%r1), 0(1)
+ pack 4095(1,%r15), 0(1)
+ pack 0(16,%r1), 0(1)
+ pack 0(16,%r15), 0(1)
+ pack 0(1), 0(16,%r1)
+ pack 0(1), 0(16,%r15)
+
#CHECK: pfd 0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x36]
#CHECK: pfd 0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x36]
#CHECK: pfd 0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x36]
@@ -9084,6 +10193,58 @@
pfdrl 7, frob@PLT
pfdrl 8, frob@PLT
+#CHECK: pka 0, 0(1) # encoding: [0xe9,0x00,0x00,0x00,0x00,0x00]
+#CHECK: pka 0, 0(1,%r1) # encoding: [0xe9,0x00,0x00,0x00,0x10,0x00]
+#CHECK: pka 0, 0(1,%r15) # encoding: [0xe9,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: pka 0, 4095(1) # encoding: [0xe9,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: pka 0, 4095(1,%r1) # encoding: [0xe9,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: pka 0, 4095(1,%r15) # encoding: [0xe9,0x00,0x00,0x00,0xff,0xff]
+#CHECK: pka 0(%r1), 0(1) # encoding: [0xe9,0x00,0x10,0x00,0x00,0x00]
+#CHECK: pka 0(%r15), 0(1) # encoding: [0xe9,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: pka 4095(%r1), 0(1) # encoding: [0xe9,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: pka 4095(%r15), 0(1) # encoding: [0xe9,0x00,0xff,0xff,0x00,0x00]
+#CHECK: pka 0, 0(256,%r1) # encoding: [0xe9,0xff,0x00,0x00,0x10,0x00]
+#CHECK: pka 0, 0(256,%r15) # encoding: [0xe9,0xff,0x00,0x00,0xf0,0x00]
+
+ pka 0, 0(1)
+ pka 0, 0(1,%r1)
+ pka 0, 0(1,%r15)
+ pka 0, 4095(1)
+ pka 0, 4095(1,%r1)
+ pka 0, 4095(1,%r15)
+ pka 0(%r1), 0(1)
+ pka 0(%r15), 0(1)
+ pka 4095(%r1), 0(1)
+ pka 4095(%r15), 0(1)
+ pka 0, 0(256,%r1)
+ pka 0, 0(256,%r15)
+
+#CHECK: pku 0, 0(1) # encoding: [0xe1,0x00,0x00,0x00,0x00,0x00]
+#CHECK: pku 0, 0(1,%r1) # encoding: [0xe1,0x00,0x00,0x00,0x10,0x00]
+#CHECK: pku 0, 0(1,%r15) # encoding: [0xe1,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: pku 0, 4095(1) # encoding: [0xe1,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: pku 0, 4095(1,%r1) # encoding: [0xe1,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: pku 0, 4095(1,%r15) # encoding: [0xe1,0x00,0x00,0x00,0xff,0xff]
+#CHECK: pku 0(%r1), 0(1) # encoding: [0xe1,0x00,0x10,0x00,0x00,0x00]
+#CHECK: pku 0(%r15), 0(1) # encoding: [0xe1,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: pku 4095(%r1), 0(1) # encoding: [0xe1,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: pku 4095(%r15), 0(1) # encoding: [0xe1,0x00,0xff,0xff,0x00,0x00]
+#CHECK: pku 0, 0(256,%r1) # encoding: [0xe1,0xff,0x00,0x00,0x10,0x00]
+#CHECK: pku 0, 0(256,%r15) # encoding: [0xe1,0xff,0x00,0x00,0xf0,0x00]
+
+ pku 0, 0(1)
+ pku 0, 0(1,%r1)
+ pku 0, 0(1,%r15)
+ pku 0, 4095(1)
+ pku 0, 4095(1,%r1)
+ pku 0, 4095(1,%r15)
+ pku 0(%r1), 0(1)
+ pku 0(%r15), 0(1)
+ pku 4095(%r1), 0(1)
+ pku 4095(%r15), 0(1)
+ pku 0, 0(256,%r1)
+ pku 0, 0(256,%r15)
+
#CHECK: plo %r0, 0, %r0, 0 # encoding: [0xee,0x00,0x00,0x00,0x00,0x00]
#CHECK: plo %r2, 0(%r1), %r4, 0(%r15) # encoding: [0xee,0x24,0x10,0x00,0xf0,0x00]
#CHECK: plo %r2, 1(%r1), %r4, 0(%r15) # encoding: [0xee,0x24,0x10,0x01,0xf0,0x00]
@@ -9117,54 +10278,6 @@
risbg %r15,%r0,0,0,0
risbg %r4,%r5,6,7,8
-#CHECK: rnsbg %r0, %r0, 0, 0, 0 # encoding: [0xec,0x00,0x00,0x00,0x00,0x54]
-#CHECK: rnsbg %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x54]
-#CHECK: rnsbg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x54]
-#CHECK: rnsbg %r0, %r0, 255, 0, 0 # encoding: [0xec,0x00,0xff,0x00,0x00,0x54]
-#CHECK: rnsbg %r0, %r15, 0, 0, 0 # encoding: [0xec,0x0f,0x00,0x00,0x00,0x54]
-#CHECK: rnsbg %r15, %r0, 0, 0, 0 # encoding: [0xec,0xf0,0x00,0x00,0x00,0x54]
-#CHECK: rnsbg %r4, %r5, 6, 7, 8 # encoding: [0xec,0x45,0x06,0x07,0x08,0x54]
-
- rnsbg %r0,%r0,0,0,0
- rnsbg %r0,%r0,0,0,63
- rnsbg %r0,%r0,0,255,0
- rnsbg %r0,%r0,255,0,0
- rnsbg %r0,%r15,0,0,0
- rnsbg %r15,%r0,0,0,0
- rnsbg %r4,%r5,6,7,8
-
-#CHECK: rosbg %r0, %r0, 0, 0, 0 # encoding: [0xec,0x00,0x00,0x00,0x00,0x56]
-#CHECK: rosbg %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x56]
-#CHECK: rosbg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x56]
-#CHECK: rosbg %r0, %r0, 255, 0, 0 # encoding: [0xec,0x00,0xff,0x00,0x00,0x56]
-#CHECK: rosbg %r0, %r15, 0, 0, 0 # encoding: [0xec,0x0f,0x00,0x00,0x00,0x56]
-#CHECK: rosbg %r15, %r0, 0, 0, 0 # encoding: [0xec,0xf0,0x00,0x00,0x00,0x56]
-#CHECK: rosbg %r4, %r5, 6, 7, 8 # encoding: [0xec,0x45,0x06,0x07,0x08,0x56]
-
- rosbg %r0,%r0,0,0,0
- rosbg %r0,%r0,0,0,63
- rosbg %r0,%r0,0,255,0
- rosbg %r0,%r0,255,0,0
- rosbg %r0,%r15,0,0,0
- rosbg %r15,%r0,0,0,0
- rosbg %r4,%r5,6,7,8
-
-#CHECK: rxsbg %r0, %r0, 0, 0, 0 # encoding: [0xec,0x00,0x00,0x00,0x00,0x57]
-#CHECK: rxsbg %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x57]
-#CHECK: rxsbg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x57]
-#CHECK: rxsbg %r0, %r0, 255, 0, 0 # encoding: [0xec,0x00,0xff,0x00,0x00,0x57]
-#CHECK: rxsbg %r0, %r15, 0, 0, 0 # encoding: [0xec,0x0f,0x00,0x00,0x00,0x57]
-#CHECK: rxsbg %r15, %r0, 0, 0, 0 # encoding: [0xec,0xf0,0x00,0x00,0x00,0x57]
-#CHECK: rxsbg %r4, %r5, 6, 7, 8 # encoding: [0xec,0x45,0x06,0x07,0x08,0x57]
-
- rxsbg %r0,%r0,0,0,0
- rxsbg %r0,%r0,0,0,63
- rxsbg %r0,%r0,0,255,0
- rxsbg %r0,%r0,255,0,0
- rxsbg %r0,%r15,0,0,0
- rxsbg %r15,%r0,0,0,0
- rxsbg %r4,%r5,6,7,8
-
#CHECK: rll %r0, %r0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0x1d]
#CHECK: rll %r15, %r1, 0 # encoding: [0xeb,0xf1,0x00,0x00,0x00,0x1d]
#CHECK: rll %r1, %r15, 0 # encoding: [0xeb,0x1f,0x00,0x00,0x00,0x1d]
@@ -9217,6 +10330,54 @@
rllg %r0,%r0,524287(%r1)
rllg %r0,%r0,524287(%r15)
+#CHECK: rnsbg %r0, %r0, 0, 0, 0 # encoding: [0xec,0x00,0x00,0x00,0x00,0x54]
+#CHECK: rnsbg %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x54]
+#CHECK: rnsbg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x54]
+#CHECK: rnsbg %r0, %r0, 255, 0, 0 # encoding: [0xec,0x00,0xff,0x00,0x00,0x54]
+#CHECK: rnsbg %r0, %r15, 0, 0, 0 # encoding: [0xec,0x0f,0x00,0x00,0x00,0x54]
+#CHECK: rnsbg %r15, %r0, 0, 0, 0 # encoding: [0xec,0xf0,0x00,0x00,0x00,0x54]
+#CHECK: rnsbg %r4, %r5, 6, 7, 8 # encoding: [0xec,0x45,0x06,0x07,0x08,0x54]
+
+ rnsbg %r0,%r0,0,0,0
+ rnsbg %r0,%r0,0,0,63
+ rnsbg %r0,%r0,0,255,0
+ rnsbg %r0,%r0,255,0,0
+ rnsbg %r0,%r15,0,0,0
+ rnsbg %r15,%r0,0,0,0
+ rnsbg %r4,%r5,6,7,8
+
+#CHECK: rosbg %r0, %r0, 0, 0, 0 # encoding: [0xec,0x00,0x00,0x00,0x00,0x56]
+#CHECK: rosbg %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x56]
+#CHECK: rosbg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x56]
+#CHECK: rosbg %r0, %r0, 255, 0, 0 # encoding: [0xec,0x00,0xff,0x00,0x00,0x56]
+#CHECK: rosbg %r0, %r15, 0, 0, 0 # encoding: [0xec,0x0f,0x00,0x00,0x00,0x56]
+#CHECK: rosbg %r15, %r0, 0, 0, 0 # encoding: [0xec,0xf0,0x00,0x00,0x00,0x56]
+#CHECK: rosbg %r4, %r5, 6, 7, 8 # encoding: [0xec,0x45,0x06,0x07,0x08,0x56]
+
+ rosbg %r0,%r0,0,0,0
+ rosbg %r0,%r0,0,0,63
+ rosbg %r0,%r0,0,255,0
+ rosbg %r0,%r0,255,0,0
+ rosbg %r0,%r15,0,0,0
+ rosbg %r15,%r0,0,0,0
+ rosbg %r4,%r5,6,7,8
+
+#CHECK: rxsbg %r0, %r0, 0, 0, 0 # encoding: [0xec,0x00,0x00,0x00,0x00,0x57]
+#CHECK: rxsbg %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x57]
+#CHECK: rxsbg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x57]
+#CHECK: rxsbg %r0, %r0, 255, 0, 0 # encoding: [0xec,0x00,0xff,0x00,0x00,0x57]
+#CHECK: rxsbg %r0, %r15, 0, 0, 0 # encoding: [0xec,0x0f,0x00,0x00,0x00,0x57]
+#CHECK: rxsbg %r15, %r0, 0, 0, 0 # encoding: [0xec,0xf0,0x00,0x00,0x00,0x57]
+#CHECK: rxsbg %r4, %r5, 6, 7, 8 # encoding: [0xec,0x45,0x06,0x07,0x08,0x57]
+
+ rxsbg %r0,%r0,0,0,0
+ rxsbg %r0,%r0,0,0,63
+ rxsbg %r0,%r0,0,255,0
+ rxsbg %r0,%r0,255,0,0
+ rxsbg %r0,%r15,0,0,0
+ rxsbg %r15,%r0,0,0,0
+ rxsbg %r4,%r5,6,7,8
+
#CHECK: s %r0, 0 # encoding: [0x5b,0x00,0x00,0x00]
#CHECK: s %r0, 4095 # encoding: [0x5b,0x00,0x0f,0xff]
#CHECK: s %r0, 0(%r1) # encoding: [0x5b,0x00,0x10,0x00]
@@ -9233,6 +10394,14 @@
s %r0, 4095(%r15,%r1)
s %r15, 0
+#CHECK: sam24 # encoding: [0x01,0x0c]
+#CHECK: sam31 # encoding: [0x01,0x0d]
+#CHECK: sam64 # encoding: [0x01,0x0e]
+
+ sam24
+ sam31
+ sam64
+
#CHECK: sar %a0, %r0 # encoding: [0xb2,0x4e,0x00,0x00]
#CHECK: sar %a0, %r15 # encoding: [0xb2,0x4e,0x00,0x0f]
#CHECK: sar %a15, %r0 # encoding: [0xb2,0x4e,0x00,0xf0]
@@ -9245,14 +10414,6 @@
sar %a7, %r8
sar %a15, %r15
-#CHECK: sam24 # encoding: [0x01,0x0c]
-#CHECK: sam31 # encoding: [0x01,0x0d]
-#CHECK: sam64 # encoding: [0x01,0x0e]
-
- sam24
- sam31
- sam64
-
#CHECK: sdb %f0, 0 # encoding: [0xed,0x00,0x00,0x00,0x00,0x1b]
#CHECK: sdb %f0, 4095 # encoding: [0xed,0x00,0x0f,0xff,0x00,0x1b]
#CHECK: sdb %f0, 0(%r1) # encoding: [0xed,0x00,0x10,0x00,0x00,0x1b]
@@ -9439,6 +10600,50 @@
sl %r0, 4095(%r15,%r1)
sl %r15, 0
+#CHECK: sla %r0, 0 # encoding: [0x8b,0x00,0x00,0x00]
+#CHECK: sla %r7, 0 # encoding: [0x8b,0x70,0x00,0x00]
+#CHECK: sla %r15, 0 # encoding: [0x8b,0xf0,0x00,0x00]
+#CHECK: sla %r0, 4095 # encoding: [0x8b,0x00,0x0f,0xff]
+#CHECK: sla %r0, 0(%r1) # encoding: [0x8b,0x00,0x10,0x00]
+#CHECK: sla %r0, 0(%r15) # encoding: [0x8b,0x00,0xf0,0x00]
+#CHECK: sla %r0, 4095(%r1) # encoding: [0x8b,0x00,0x1f,0xff]
+#CHECK: sla %r0, 4095(%r15) # encoding: [0x8b,0x00,0xff,0xff]
+
+ sla %r0,0
+ sla %r7,0
+ sla %r15,0
+ sla %r0,4095
+ sla %r0,0(%r1)
+ sla %r0,0(%r15)
+ sla %r0,4095(%r1)
+ sla %r0,4095(%r15)
+
+#CHECK: slag %r0, %r0, 0 # encoding: [0xeb,0x00,0x00,0x00,0x00,0x0b]
+#CHECK: slag %r15, %r1, 0 # encoding: [0xeb,0xf1,0x00,0x00,0x00,0x0b]
+#CHECK: slag %r1, %r15, 0 # encoding: [0xeb,0x1f,0x00,0x00,0x00,0x0b]
+#CHECK: slag %r15, %r15, 0 # encoding: [0xeb,0xff,0x00,0x00,0x00,0x0b]
+#CHECK: slag %r0, %r0, -524288 # encoding: [0xeb,0x00,0x00,0x00,0x80,0x0b]
+#CHECK: slag %r0, %r0, -1 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x0b]
+#CHECK: slag %r0, %r0, 1 # encoding: [0xeb,0x00,0x00,0x01,0x00,0x0b]
+#CHECK: slag %r0, %r0, 524287 # encoding: [0xeb,0x00,0x0f,0xff,0x7f,0x0b]
+#CHECK: slag %r0, %r0, 0(%r1) # encoding: [0xeb,0x00,0x10,0x00,0x00,0x0b]
+#CHECK: slag %r0, %r0, 0(%r15) # encoding: [0xeb,0x00,0xf0,0x00,0x00,0x0b]
+#CHECK: slag %r0, %r0, 524287(%r1) # encoding: [0xeb,0x00,0x1f,0xff,0x7f,0x0b]
+#CHECK: slag %r0, %r0, 524287(%r15) # encoding: [0xeb,0x00,0xff,0xff,0x7f,0x0b]
+
+ slag %r0,%r0,0
+ slag %r15,%r1,0
+ slag %r1,%r15,0
+ slag %r15,%r15,0
+ slag %r0,%r0,-524288
+ slag %r0,%r0,-1
+ slag %r0,%r0,1
+ slag %r0,%r0,524287
+ slag %r0,%r0,0(%r1)
+ slag %r0,%r0,0(%r15)
+ slag %r0,%r0,524287(%r1)
+ slag %r0,%r0,524287(%r15)
+
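+# (The 20-bit signed displacements above are split across the encoding,
+# low 12 bits in the middle halfword and high 8 bits in byte 4, a sketch
+# of the RSY layout: -524288 = 0x80000 becomes DL=0x000, DH=0x80 as in
+# [0xeb,0x00,0x00,0x00,0x80,0x0b], and 524287 = 0x7ffff becomes DL=0xfff,
+# DH=0x7f.)
+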
#CHECK: slb %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x99]
#CHECK: slb %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x99]
#CHECK: slb %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x99]
@@ -9503,6 +10708,42 @@
slbr %r15,%r0
slbr %r7,%r8
+#CHECK: slda %r0, 0 # encoding: [0x8f,0x00,0x00,0x00]
+#CHECK: slda %r6, 0 # encoding: [0x8f,0x60,0x00,0x00]
+#CHECK: slda %r14, 0 # encoding: [0x8f,0xe0,0x00,0x00]
+#CHECK: slda %r0, 4095 # encoding: [0x8f,0x00,0x0f,0xff]
+#CHECK: slda %r0, 0(%r1) # encoding: [0x8f,0x00,0x10,0x00]
+#CHECK: slda %r0, 0(%r15) # encoding: [0x8f,0x00,0xf0,0x00]
+#CHECK: slda %r0, 4095(%r1) # encoding: [0x8f,0x00,0x1f,0xff]
+#CHECK: slda %r0, 4095(%r15) # encoding: [0x8f,0x00,0xff,0xff]
+
+ slda %r0,0
+ slda %r6,0
+ slda %r14,0
+ slda %r0,4095
+ slda %r0,0(%r1)
+ slda %r0,0(%r15)
+ slda %r0,4095(%r1)
+ slda %r0,4095(%r15)
+
+#CHECK: sldl %r0, 0 # encoding: [0x8d,0x00,0x00,0x00]
+#CHECK: sldl %r6, 0 # encoding: [0x8d,0x60,0x00,0x00]
+#CHECK: sldl %r14, 0 # encoding: [0x8d,0xe0,0x00,0x00]
+#CHECK: sldl %r0, 4095 # encoding: [0x8d,0x00,0x0f,0xff]
+#CHECK: sldl %r0, 0(%r1) # encoding: [0x8d,0x00,0x10,0x00]
+#CHECK: sldl %r0, 0(%r15) # encoding: [0x8d,0x00,0xf0,0x00]
+#CHECK: sldl %r0, 4095(%r1) # encoding: [0x8d,0x00,0x1f,0xff]
+#CHECK: sldl %r0, 4095(%r15) # encoding: [0x8d,0x00,0xff,0xff]
+
+ sldl %r0,0
+ sldl %r6,0
+ sldl %r14,0
+ sldl %r0,4095
+ sldl %r0,0(%r1)
+ sldl %r0,0(%r15)
+ sldl %r0,4095(%r1)
+ sldl %r0,4095(%r15)
+
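+# (The double-width shifts above use only even registers, %r0, %r6, %r14,
+# because slda/sldl operate on an even-odd register pair; an odd first
+# operand would raise a specification exception, so this file only
+# exercises the valid encodings.)
+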
#CHECK: slfi %r0, 0 # encoding: [0xc2,0x05,0x00,0x00,0x00,0x00]
#CHECK: slfi %r0, 4294967295 # encoding: [0xc2,0x05,0xff,0xff,0xff,0xff]
#CHECK: slfi %r15, 0 # encoding: [0xc2,0xf5,0x00,0x00,0x00,0x00]
@@ -9583,24 +10824,6 @@
slgr %r15,%r0
slgr %r7,%r8
-#CHECK: sla %r0, 0 # encoding: [0x8b,0x00,0x00,0x00]
-#CHECK: sla %r7, 0 # encoding: [0x8b,0x70,0x00,0x00]
-#CHECK: sla %r15, 0 # encoding: [0x8b,0xf0,0x00,0x00]
-#CHECK: sla %r0, 4095 # encoding: [0x8b,0x00,0x0f,0xff]
-#CHECK: sla %r0, 0(%r1) # encoding: [0x8b,0x00,0x10,0x00]
-#CHECK: sla %r0, 0(%r15) # encoding: [0x8b,0x00,0xf0,0x00]
-#CHECK: sla %r0, 4095(%r1) # encoding: [0x8b,0x00,0x1f,0xff]
-#CHECK: sla %r0, 4095(%r15) # encoding: [0x8b,0x00,0xff,0xff]
-
- sla %r0,0
- sla %r7,0
- sla %r15,0
- sla %r0,4095
- sla %r0,0(%r1)
- sla %r0,0(%r15)
- sla %r0,4095(%r1)
- sla %r0,4095(%r15)
-
#CHECK: sll %r0, 0 # encoding: [0x89,0x00,0x00,0x00]
#CHECK: sll %r7, 0 # encoding: [0x89,0x70,0x00,0x00]
#CHECK: sll %r15, 0 # encoding: [0x89,0xf0,0x00,0x00]
@@ -9677,6 +10900,36 @@
sly %r0, 524287(%r15,%r1)
sly %r15, 0
+#CHECK: sp 0(1), 0(1) # encoding: [0xfb,0x00,0x00,0x00,0x00,0x00]
+#CHECK: sp 0(1), 0(1,%r1) # encoding: [0xfb,0x00,0x00,0x00,0x10,0x00]
+#CHECK: sp 0(1), 0(1,%r15) # encoding: [0xfb,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: sp 0(1), 4095(1) # encoding: [0xfb,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: sp 0(1), 4095(1,%r1) # encoding: [0xfb,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: sp 0(1), 4095(1,%r15) # encoding: [0xfb,0x00,0x00,0x00,0xff,0xff]
+#CHECK: sp 0(1,%r1), 0(1) # encoding: [0xfb,0x00,0x10,0x00,0x00,0x00]
+#CHECK: sp 0(1,%r15), 0(1) # encoding: [0xfb,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: sp 4095(1,%r1), 0(1) # encoding: [0xfb,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: sp 4095(1,%r15), 0(1) # encoding: [0xfb,0x00,0xff,0xff,0x00,0x00]
+#CHECK: sp 0(16,%r1), 0(1) # encoding: [0xfb,0xf0,0x10,0x00,0x00,0x00]
+#CHECK: sp 0(16,%r15), 0(1) # encoding: [0xfb,0xf0,0xf0,0x00,0x00,0x00]
+#CHECK: sp 0(1), 0(16,%r1) # encoding: [0xfb,0x0f,0x00,0x00,0x10,0x00]
+#CHECK: sp 0(1), 0(16,%r15) # encoding: [0xfb,0x0f,0x00,0x00,0xf0,0x00]
+
+ sp 0(1), 0(1)
+ sp 0(1), 0(1,%r1)
+ sp 0(1), 0(1,%r15)
+ sp 0(1), 4095(1)
+ sp 0(1), 4095(1,%r1)
+ sp 0(1), 4095(1,%r15)
+ sp 0(1,%r1), 0(1)
+ sp 0(1,%r15), 0(1)
+ sp 4095(1,%r1), 0(1)
+ sp 4095(1,%r15), 0(1)
+ sp 0(16,%r1), 0(1)
+ sp 0(16,%r15), 0(1)
+ sp 0(1), 0(16,%r1)
+ sp 0(1), 0(16,%r15)
+
#CHECK: spm %r0 # encoding: [0x04,0x00]
#CHECK: spm %r1 # encoding: [0x04,0x10]
#CHECK: spm %r15 # encoding: [0x04,0xf0]
@@ -9801,6 +11054,42 @@
srag %r0,%r0,524287(%r1)
srag %r0,%r0,524287(%r15)
+#CHECK: srda %r0, 0 # encoding: [0x8e,0x00,0x00,0x00]
+#CHECK: srda %r6, 0 # encoding: [0x8e,0x60,0x00,0x00]
+#CHECK: srda %r14, 0 # encoding: [0x8e,0xe0,0x00,0x00]
+#CHECK: srda %r0, 4095 # encoding: [0x8e,0x00,0x0f,0xff]
+#CHECK: srda %r0, 0(%r1) # encoding: [0x8e,0x00,0x10,0x00]
+#CHECK: srda %r0, 0(%r15) # encoding: [0x8e,0x00,0xf0,0x00]
+#CHECK: srda %r0, 4095(%r1) # encoding: [0x8e,0x00,0x1f,0xff]
+#CHECK: srda %r0, 4095(%r15) # encoding: [0x8e,0x00,0xff,0xff]
+
+ srda %r0,0
+ srda %r6,0
+ srda %r14,0
+ srda %r0,4095
+ srda %r0,0(%r1)
+ srda %r0,0(%r15)
+ srda %r0,4095(%r1)
+ srda %r0,4095(%r15)
+
+#CHECK: srdl %r0, 0 # encoding: [0x8c,0x00,0x00,0x00]
+#CHECK: srdl %r6, 0 # encoding: [0x8c,0x60,0x00,0x00]
+#CHECK: srdl %r14, 0 # encoding: [0x8c,0xe0,0x00,0x00]
+#CHECK: srdl %r0, 4095 # encoding: [0x8c,0x00,0x0f,0xff]
+#CHECK: srdl %r0, 0(%r1) # encoding: [0x8c,0x00,0x10,0x00]
+#CHECK: srdl %r0, 0(%r15) # encoding: [0x8c,0x00,0xf0,0x00]
+#CHECK: srdl %r0, 4095(%r1) # encoding: [0x8c,0x00,0x1f,0xff]
+#CHECK: srdl %r0, 4095(%r15) # encoding: [0x8c,0x00,0xff,0xff]
+
+ srdl %r0,0
+ srdl %r6,0
+ srdl %r14,0
+ srdl %r0,4095
+ srdl %r0,0(%r1)
+ srdl %r0,0(%r15)
+ srdl %r0,4095(%r1)
+ srdl %r0,4095(%r15)
+
#CHECK: srl %r0, 0 # encoding: [0x88,0x00,0x00,0x00]
#CHECK: srl %r7, 0 # encoding: [0x88,0x70,0x00,0x00]
#CHECK: srl %r15, 0 # encoding: [0x88,0xf0,0x00,0x00]
@@ -9873,6 +11162,34 @@
srnmt 4095(%r1)
srnmt 4095(%r15)
+#CHECK: srp 0(1), 0, 0 # encoding: [0xf0,0x00,0x00,0x00,0x00,0x00]
+#CHECK: srp 0(1), 0, 15 # encoding: [0xf0,0x0f,0x00,0x00,0x00,0x00]
+#CHECK: srp 0(1), 0(%r1), 0 # encoding: [0xf0,0x00,0x00,0x00,0x10,0x00]
+#CHECK: srp 0(1), 0(%r15), 0 # encoding: [0xf0,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: srp 0(1), 4095, 0 # encoding: [0xf0,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: srp 0(1), 4095(%r1), 0 # encoding: [0xf0,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: srp 0(1), 4095(%r15), 0 # encoding: [0xf0,0x00,0x00,0x00,0xff,0xff]
+#CHECK: srp 0(1,%r1), 0, 0 # encoding: [0xf0,0x00,0x10,0x00,0x00,0x00]
+#CHECK: srp 0(1,%r15), 0, 0 # encoding: [0xf0,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: srp 4095(1,%r1), 0, 0 # encoding: [0xf0,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: srp 4095(1,%r15), 0, 0 # encoding: [0xf0,0x00,0xff,0xff,0x00,0x00]
+#CHECK: srp 0(16,%r1), 0, 0 # encoding: [0xf0,0xf0,0x10,0x00,0x00,0x00]
+#CHECK: srp 0(16,%r15), 0, 0 # encoding: [0xf0,0xf0,0xf0,0x00,0x00,0x00]
+
+ srp 0(1), 0, 0
+ srp 0(1), 0, 15
+ srp 0(1), 0(%r1), 0
+ srp 0(1), 0(%r15), 0
+ srp 0(1), 4095, 0
+ srp 0(1), 4095(%r1), 0
+ srp 0(1), 4095(%r15), 0
+ srp 0(1,%r1), 0, 0
+ srp 0(1,%r15), 0, 0
+ srp 4095(1,%r1), 0, 0
+ srp 4095(1,%r15), 0, 0
+ srp 0(16,%r1), 0, 0
+ srp 0(16,%r15), 0, 0
+
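+# (In the srp encodings above, byte 1 packs the first-operand length minus
+# one in the high nibble and the rounding digit in the low nibble, so
+# "srp 0(1), 0, 15" gives 0x0f and "srp 0(16,%r1), 0, 0" gives 0xf0; a
+# sketch of the SS layout, not something the test itself states.)
+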
#CHECK: srst %r0, %r0 # encoding: [0xb2,0x5e,0x00,0x00]
#CHECK: srst %r0, %r15 # encoding: [0xb2,0x5e,0x00,0x0f]
#CHECK: srst %r15, %r0 # encoding: [0xb2,0x5e,0x00,0xf0]
@@ -9883,6 +11200,16 @@
srst %r15,%r0
srst %r7,%r8
+#CHECK: srstu %r0, %r0 # encoding: [0xb9,0xbe,0x00,0x00]
+#CHECK: srstu %r0, %r15 # encoding: [0xb9,0xbe,0x00,0x0f]
+#CHECK: srstu %r15, %r0 # encoding: [0xb9,0xbe,0x00,0xf0]
+#CHECK: srstu %r7, %r8 # encoding: [0xb9,0xbe,0x00,0x78]
+
+ srstu %r0,%r0
+ srstu %r0,%r15
+ srstu %r15,%r0
+ srstu %r7,%r8
+
#CHECK: st %r0, 0 # encoding: [0x50,0x00,0x00,0x00]
#CHECK: st %r0, 4095 # encoding: [0x50,0x00,0x0f,0xff]
#CHECK: st %r0, 0(%r1) # encoding: [0x50,0x00,0x10,0x00]
@@ -9970,56 +11297,102 @@
#CHECK: stck 0(%r15) # encoding: [0xb2,0x05,0xf0,0x00]
#CHECK: stck 4095 # encoding: [0xb2,0x05,0x0f,0xff]
#CHECK: stck 4095(%r1) # encoding: [0xb2,0x05,0x1f,0xff]
-#CHECK: stck 4095(%r15) # encoding: [0xb2,0x05,0xff,0xff]
+#CHECK: stck 4095(%r15) # encoding: [0xb2,0x05,0xff,0xff]
stck 0
stck 0(%r1)
stck 0(%r15)
- stck 4095
+ stck 4095
stck 4095(%r1)
stck 4095(%r15)
-#CHECK: stckf 0 # encoding: [0xb2,0x7c,0x00,0x00]
-#CHECK: stckf 0(%r1) # encoding: [0xb2,0x7c,0x10,0x00]
-#CHECK: stckf 0(%r15) # encoding: [0xb2,0x7c,0xf0,0x00]
-#CHECK: stckf 4095 # encoding: [0xb2,0x7c,0x0f,0xff]
-#CHECK: stckf 4095(%r1) # encoding: [0xb2,0x7c,0x1f,0xff]
-#CHECK: stckf 4095(%r15) # encoding: [0xb2,0x7c,0xff,0xff]
-
- stckf 0
- stckf 0(%r1)
- stckf 0(%r15)
- stckf 4095
- stckf 4095(%r1)
- stckf 4095(%r15)
-
#CHECK: stcke 0 # encoding: [0xb2,0x78,0x00,0x00]
#CHECK: stcke 0(%r1) # encoding: [0xb2,0x78,0x10,0x00]
#CHECK: stcke 0(%r15) # encoding: [0xb2,0x78,0xf0,0x00]
#CHECK: stcke 4095 # encoding: [0xb2,0x78,0x0f,0xff]
#CHECK: stcke 4095(%r1) # encoding: [0xb2,0x78,0x1f,0xff]
-#CHECK: stcke 4095(%r15) # encoding: [0xb2,0x78,0xff,0xff]
+#CHECK: stcke 4095(%r15) # encoding: [0xb2,0x78,0xff,0xff]
stcke 0
stcke 0(%r1)
stcke 0(%r15)
- stcke 4095
+ stcke 4095
stcke 4095(%r1)
stcke 4095(%r15)
-#CHECK: stfle 0 # encoding: [0xb2,0xb0,0x00,0x00]
-#CHECK: stfle 0(%r1) # encoding: [0xb2,0xb0,0x10,0x00]
-#CHECK: stfle 0(%r15) # encoding: [0xb2,0xb0,0xf0,0x00]
-#CHECK: stfle 4095 # encoding: [0xb2,0xb0,0x0f,0xff]
-#CHECK: stfle 4095(%r1) # encoding: [0xb2,0xb0,0x1f,0xff]
-#CHECK: stfle 4095(%r15) # encoding: [0xb2,0xb0,0xff,0xff]
+#CHECK: stckf 0 # encoding: [0xb2,0x7c,0x00,0x00]
+#CHECK: stckf 0(%r1) # encoding: [0xb2,0x7c,0x10,0x00]
+#CHECK: stckf 0(%r15) # encoding: [0xb2,0x7c,0xf0,0x00]
+#CHECK: stckf 4095 # encoding: [0xb2,0x7c,0x0f,0xff]
+#CHECK: stckf 4095(%r1) # encoding: [0xb2,0x7c,0x1f,0xff]
+#CHECK: stckf 4095(%r15) # encoding: [0xb2,0x7c,0xff,0xff]
- stfle 0
- stfle 0(%r1)
- stfle 0(%r15)
- stfle 4095
- stfle 4095(%r1)
- stfle 4095(%r15)
+ stckf 0
+ stckf 0(%r1)
+ stckf 0(%r15)
+ stckf 4095
+ stckf 4095(%r1)
+ stckf 4095(%r15)
+
+#CHECK: stcm %r0, 0, 0 # encoding: [0xbe,0x00,0x00,0x00]
+#CHECK: stcm %r0, 15, 4095 # encoding: [0xbe,0x0f,0x0f,0xff]
+#CHECK: stcm %r0, 0, 0(%r1) # encoding: [0xbe,0x00,0x10,0x00]
+#CHECK: stcm %r0, 0, 0(%r15) # encoding: [0xbe,0x00,0xf0,0x00]
+#CHECK: stcm %r15, 15, 4095(%r1) # encoding: [0xbe,0xff,0x1f,0xff]
+#CHECK: stcm %r0, 0, 4095(%r15) # encoding: [0xbe,0x00,0xff,0xff]
+#CHECK: stcm %r15, 0, 0 # encoding: [0xbe,0xf0,0x00,0x00]
+
+ stcm %r0, 0, 0
+ stcm %r0, 15, 4095
+ stcm %r0, 0, 0(%r1)
+ stcm %r0, 0, 0(%r15)
+ stcm %r15, 15, 4095(%r1)
+ stcm %r0, 0, 4095(%r15)
+ stcm %r15, 0, 0
+
+#CHECK: stcmh %r0, 0, -524288 # encoding: [0xeb,0x00,0x00,0x00,0x80,0x2c]
+#CHECK: stcmh %r0, 0, -1 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x2c]
+#CHECK: stcmh %r0, 15, 0 # encoding: [0xeb,0x0f,0x00,0x00,0x00,0x2c]
+#CHECK: stcmh %r0, 15, 1 # encoding: [0xeb,0x0f,0x00,0x01,0x00,0x2c]
+#CHECK: stcmh %r0, 8, 524287 # encoding: [0xeb,0x08,0x0f,0xff,0x7f,0x2c]
+#CHECK: stcmh %r0, 8, 0(%r1) # encoding: [0xeb,0x08,0x10,0x00,0x00,0x2c]
+#CHECK: stcmh %r0, 4, 0(%r15) # encoding: [0xeb,0x04,0xf0,0x00,0x00,0x2c]
+#CHECK: stcmh %r0, 4, 524287(%r15) # encoding: [0xeb,0x04,0xff,0xff,0x7f,0x2c]
+#CHECK: stcmh %r0, 0, 524287(%r1) # encoding: [0xeb,0x00,0x1f,0xff,0x7f,0x2c]
+#CHECK: stcmh %r15, 0, 0 # encoding: [0xeb,0xf0,0x00,0x00,0x00,0x2c]
+
+ stcmh %r0, 0, -524288
+ stcmh %r0, 0, -1
+ stcmh %r0, 15, 0
+ stcmh %r0, 15, 1
+ stcmh %r0, 8, 524287
+ stcmh %r0, 8, 0(%r1)
+ stcmh %r0, 4, 0(%r15)
+ stcmh %r0, 4, 524287(%r15)
+ stcmh %r0, 0, 524287(%r1)
+ stcmh %r15, 0, 0
+
+#CHECK: stcmy %r0, 0, -524288 # encoding: [0xeb,0x00,0x00,0x00,0x80,0x2d]
+#CHECK: stcmy %r0, 0, -1 # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x2d]
+#CHECK: stcmy %r0, 15, 0 # encoding: [0xeb,0x0f,0x00,0x00,0x00,0x2d]
+#CHECK: stcmy %r0, 15, 1 # encoding: [0xeb,0x0f,0x00,0x01,0x00,0x2d]
+#CHECK: stcmy %r0, 8, 524287 # encoding: [0xeb,0x08,0x0f,0xff,0x7f,0x2d]
+#CHECK: stcmy %r0, 8, 0(%r1) # encoding: [0xeb,0x08,0x10,0x00,0x00,0x2d]
+#CHECK: stcmy %r0, 4, 0(%r15) # encoding: [0xeb,0x04,0xf0,0x00,0x00,0x2d]
+#CHECK: stcmy %r0, 4, 524287(%r15) # encoding: [0xeb,0x04,0xff,0xff,0x7f,0x2d]
+#CHECK: stcmy %r0, 0, 524287(%r1) # encoding: [0xeb,0x00,0x1f,0xff,0x7f,0x2d]
+#CHECK: stcmy %r15, 0, 0 # encoding: [0xeb,0xf0,0x00,0x00,0x00,0x2d]
+
+ stcmy %r0, 0, -524288
+ stcmy %r0, 0, -1
+ stcmy %r0, 15, 0
+ stcmy %r0, 15, 1
+ stcmy %r0, 8, 524287
+ stcmy %r0, 8, 0(%r1)
+ stcmy %r0, 4, 0(%r15)
+ stcmy %r0, 4, 524287(%r15)
+ stcmy %r0, 0, 524287(%r1)
+ stcmy %r15, 0, 0
#CHECK: stcy %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x72]
#CHECK: stcy %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x72]
@@ -10119,6 +11492,20 @@
stey %f0, 524287(%r15,%r1)
stey %f15, 0
+#CHECK: stfle 0 # encoding: [0xb2,0xb0,0x00,0x00]
+#CHECK: stfle 0(%r1) # encoding: [0xb2,0xb0,0x10,0x00]
+#CHECK: stfle 0(%r15) # encoding: [0xb2,0xb0,0xf0,0x00]
+#CHECK: stfle 4095 # encoding: [0xb2,0xb0,0x0f,0xff]
+#CHECK: stfle 4095(%r1) # encoding: [0xb2,0xb0,0x1f,0xff]
+#CHECK: stfle 4095(%r15) # encoding: [0xb2,0xb0,0xff,0xff]
+
+ stfle 0
+ stfle 0(%r1)
+ stfle 0(%r15)
+ stfle 4095
+ stfle 4095(%r1)
+ stfle 4095(%r15)
+
#CHECK: stfpc 0 # encoding: [0xb2,0x9c,0x00,0x00]
#CHECK: stfpc 0(%r1) # encoding: [0xb2,0x9c,0x10,0x00]
#CHECK: stfpc 0(%r15) # encoding: [0xb2,0x9c,0xf0,0x00]
@@ -10454,28 +11841,6 @@
strl %r7,frob@PLT
strl %r8,frob@PLT
-#CHECK: strvh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3f]
-#CHECK: strvh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3f]
-#CHECK: strvh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3f]
-#CHECK: strvh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x3f]
-#CHECK: strvh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x3f]
-#CHECK: strvh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x3f]
-#CHECK: strvh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x3f]
-#CHECK: strvh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x3f]
-#CHECK: strvh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x3f]
-#CHECK: strvh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x3f]
-
- strvh %r0,-524288
- strvh %r0,-1
- strvh %r0,0
- strvh %r0,1
- strvh %r0,524287
- strvh %r0,0(%r1)
- strvh %r0,0(%r15)
- strvh %r0,524287(%r1,%r15)
- strvh %r0,524287(%r15,%r1)
- strvh %r15,0
-
#CHECK: strv %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3e]
#CHECK: strv %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3e]
#CHECK: strv %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3e]
@@ -10520,15 +11885,27 @@
strvg %r0,524287(%r15,%r1)
strvg %r15,0
-#CHECK: svc 0 # encoding: [0x0a,0x00]
-#CHECK: svc 3 # encoding: [0x0a,0x03]
-#CHECK: svc 128 # encoding: [0x0a,0x80]
-#CHECK: svc 255 # encoding: [0x0a,0xff]
+#CHECK: strvh %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x3f]
+#CHECK: strvh %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x3f]
+#CHECK: strvh %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x3f]
+#CHECK: strvh %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x3f]
+#CHECK: strvh %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x3f]
+#CHECK: strvh %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x3f]
+#CHECK: strvh %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x3f]
+#CHECK: strvh %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x3f]
+#CHECK: strvh %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x3f]
+#CHECK: strvh %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x3f]
- svc 0
- svc 3
- svc 128
- svc 0xff
+ strvh %r0,-524288
+ strvh %r0,-1
+ strvh %r0,0
+ strvh %r0,1
+ strvh %r0,524287
+ strvh %r0,0(%r1)
+ strvh %r0,0(%r15)
+ strvh %r0,524287(%r1,%r15)
+ strvh %r0,524287(%r15,%r1)
+ strvh %r15,0
#CHECK: sty %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x50]
#CHECK: sty %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x50]
@@ -10552,6 +11929,16 @@
sty %r0, 524287(%r15,%r1)
sty %r15, 0
+#CHECK: svc 0 # encoding: [0x0a,0x00]
+#CHECK: svc 3 # encoding: [0x0a,0x03]
+#CHECK: svc 128 # encoding: [0x0a,0x80]
+#CHECK: svc 255 # encoding: [0x0a,0xff]
+
+ svc 0
+ svc 3
+ svc 128
+ svc 0xff
+
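+# (The last input above is written as 0xff; immediates are printed in
+# decimal, which is why the expected line reads "svc 255".)
+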
#CHECK: sxbr %f0, %f0 # encoding: [0xb3,0x4b,0x00,0x00]
#CHECK: sxbr %f0, %f13 # encoding: [0xb3,0x4b,0x00,0x0d]
#CHECK: sxbr %f8, %f8 # encoding: [0xb3,0x4b,0x00,0x88]
@@ -10734,6 +12121,194 @@
tmy 524287(%r1), 42
tmy 524287(%r15), 42
+#CHECK: tp 0(1) # encoding: [0xeb,0x00,0x00,0x00,0x00,0xc0]
+#CHECK: tp 0(1,%r1) # encoding: [0xeb,0x00,0x10,0x00,0x00,0xc0]
+#CHECK: tp 0(1,%r15) # encoding: [0xeb,0x00,0xf0,0x00,0x00,0xc0]
+#CHECK: tp 4095(1,%r1) # encoding: [0xeb,0x00,0x1f,0xff,0x00,0xc0]
+#CHECK: tp 4095(1,%r15) # encoding: [0xeb,0x00,0xff,0xff,0x00,0xc0]
+#CHECK: tp 0(16,%r1) # encoding: [0xeb,0xf0,0x10,0x00,0x00,0xc0]
+#CHECK: tp 0(16,%r15) # encoding: [0xeb,0xf0,0xf0,0x00,0x00,0xc0]
+
+ tp 0(1)
+ tp 0(1,%r1)
+ tp 0(1,%r15)
+ tp 4095(1,%r1)
+ tp 4095(1,%r15)
+ tp 0(16,%r1)
+ tp 0(16,%r15)
+
+#CHECK: tr 0(1), 0 # encoding: [0xdc,0x00,0x00,0x00,0x00,0x00]
+#CHECK: tr 0(1), 0(%r1) # encoding: [0xdc,0x00,0x00,0x00,0x10,0x00]
+#CHECK: tr 0(1), 0(%r15) # encoding: [0xdc,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: tr 0(1), 4095 # encoding: [0xdc,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: tr 0(1), 4095(%r1) # encoding: [0xdc,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: tr 0(1), 4095(%r15) # encoding: [0xdc,0x00,0x00,0x00,0xff,0xff]
+#CHECK: tr 0(1,%r1), 0 # encoding: [0xdc,0x00,0x10,0x00,0x00,0x00]
+#CHECK: tr 0(1,%r15), 0 # encoding: [0xdc,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: tr 4095(1,%r1), 0 # encoding: [0xdc,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: tr 4095(1,%r15), 0 # encoding: [0xdc,0x00,0xff,0xff,0x00,0x00]
+#CHECK: tr 0(256,%r1), 0 # encoding: [0xdc,0xff,0x10,0x00,0x00,0x00]
+#CHECK: tr 0(256,%r15), 0 # encoding: [0xdc,0xff,0xf0,0x00,0x00,0x00]
+
+ tr 0(1), 0
+ tr 0(1), 0(%r1)
+ tr 0(1), 0(%r15)
+ tr 0(1), 4095
+ tr 0(1), 4095(%r1)
+ tr 0(1), 4095(%r15)
+ tr 0(1,%r1), 0
+ tr 0(1,%r15), 0
+ tr 4095(1,%r1), 0
+ tr 4095(1,%r15), 0
+ tr 0(256,%r1), 0
+ tr 0(256,%r15), 0
+
+#CHECK: tre %r0, %r0 # encoding: [0xb2,0xa5,0x00,0x00]
+#CHECK: tre %r0, %r15 # encoding: [0xb2,0xa5,0x00,0x0f]
+#CHECK: tre %r14, %r0 # encoding: [0xb2,0xa5,0x00,0xe0]
+#CHECK: tre %r6, %r8 # encoding: [0xb2,0xa5,0x00,0x68]
+
+ tre %r0, %r0
+ tre %r0, %r15
+ tre %r14, %r0
+ tre %r6, %r8
+
+#CHECK: troo %r0, %r0 # encoding: [0xb9,0x93,0x00,0x00]
+#CHECK: troo %r0, %r15 # encoding: [0xb9,0x93,0x00,0x0f]
+#CHECK: troo %r14, %r0 # encoding: [0xb9,0x93,0x00,0xe0]
+#CHECK: troo %r6, %r8 # encoding: [0xb9,0x93,0x00,0x68]
+#CHECK: troo %r4, %r13, 0 # encoding: [0xb9,0x93,0x00,0x4d]
+#CHECK: troo %r4, %r13, 15 # encoding: [0xb9,0x93,0xf0,0x4d]
+
+ troo %r0, %r0
+ troo %r0, %r15
+ troo %r14, %r0
+ troo %r6, %r8
+ troo %r4, %r13, 0
+ troo %r4, %r13, 15
+
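+# (The optional third operand above lands in the high nibble of byte 2 of
+# the encoding: compare "troo %r4, %r13, 15" giving 0xf0 with the
+# two-operand forms giving 0x00, so omitting it is the same as writing 0.
+# The same applies to trot/trte/trto/trtre/trtt below.)
+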
+#CHECK: trot %r0, %r0 # encoding: [0xb9,0x92,0x00,0x00]
+#CHECK: trot %r0, %r15 # encoding: [0xb9,0x92,0x00,0x0f]
+#CHECK: trot %r14, %r0 # encoding: [0xb9,0x92,0x00,0xe0]
+#CHECK: trot %r6, %r8 # encoding: [0xb9,0x92,0x00,0x68]
+#CHECK: trot %r4, %r13, 0 # encoding: [0xb9,0x92,0x00,0x4d]
+#CHECK: trot %r4, %r13, 15 # encoding: [0xb9,0x92,0xf0,0x4d]
+
+ trot %r0, %r0
+ trot %r0, %r15
+ trot %r14, %r0
+ trot %r6, %r8
+ trot %r4, %r13, 0
+ trot %r4, %r13, 15
+
+#CHECK: trt 0(1), 0 # encoding: [0xdd,0x00,0x00,0x00,0x00,0x00]
+#CHECK: trt 0(1), 0(%r1) # encoding: [0xdd,0x00,0x00,0x00,0x10,0x00]
+#CHECK: trt 0(1), 0(%r15) # encoding: [0xdd,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: trt 0(1), 4095 # encoding: [0xdd,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: trt 0(1), 4095(%r1) # encoding: [0xdd,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: trt 0(1), 4095(%r15) # encoding: [0xdd,0x00,0x00,0x00,0xff,0xff]
+#CHECK: trt 0(1,%r1), 0 # encoding: [0xdd,0x00,0x10,0x00,0x00,0x00]
+#CHECK: trt 0(1,%r15), 0 # encoding: [0xdd,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: trt 4095(1,%r1), 0 # encoding: [0xdd,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: trt 4095(1,%r15), 0 # encoding: [0xdd,0x00,0xff,0xff,0x00,0x00]
+#CHECK: trt 0(256,%r1), 0 # encoding: [0xdd,0xff,0x10,0x00,0x00,0x00]
+#CHECK: trt 0(256,%r15), 0 # encoding: [0xdd,0xff,0xf0,0x00,0x00,0x00]
+
+ trt 0(1), 0
+ trt 0(1), 0(%r1)
+ trt 0(1), 0(%r15)
+ trt 0(1), 4095
+ trt 0(1), 4095(%r1)
+ trt 0(1), 4095(%r15)
+ trt 0(1,%r1), 0
+ trt 0(1,%r15), 0
+ trt 4095(1,%r1), 0
+ trt 4095(1,%r15), 0
+ trt 0(256,%r1), 0
+ trt 0(256,%r15), 0
+
+#CHECK: trte %r0, %r0 # encoding: [0xb9,0xbf,0x00,0x00]
+#CHECK: trte %r0, %r15 # encoding: [0xb9,0xbf,0x00,0x0f]
+#CHECK: trte %r14, %r0 # encoding: [0xb9,0xbf,0x00,0xe0]
+#CHECK: trte %r6, %r8 # encoding: [0xb9,0xbf,0x00,0x68]
+#CHECK: trte %r4, %r13, 0 # encoding: [0xb9,0xbf,0x00,0x4d]
+#CHECK: trte %r4, %r13, 15 # encoding: [0xb9,0xbf,0xf0,0x4d]
+
+ trte %r0, %r0
+ trte %r0, %r15
+ trte %r14, %r0
+ trte %r6, %r8
+ trte %r4, %r13, 0
+ trte %r4, %r13, 15
+
+#CHECK: trto %r0, %r0 # encoding: [0xb9,0x91,0x00,0x00]
+#CHECK: trto %r0, %r15 # encoding: [0xb9,0x91,0x00,0x0f]
+#CHECK: trto %r14, %r0 # encoding: [0xb9,0x91,0x00,0xe0]
+#CHECK: trto %r6, %r8 # encoding: [0xb9,0x91,0x00,0x68]
+#CHECK: trto %r4, %r13, 0 # encoding: [0xb9,0x91,0x00,0x4d]
+#CHECK: trto %r4, %r13, 15 # encoding: [0xb9,0x91,0xf0,0x4d]
+
+ trto %r0, %r0
+ trto %r0, %r15
+ trto %r14, %r0
+ trto %r6, %r8
+ trto %r4, %r13, 0
+ trto %r4, %r13, 15
+
+#CHECK: trtr 0(1), 0 # encoding: [0xd0,0x00,0x00,0x00,0x00,0x00]
+#CHECK: trtr 0(1), 0(%r1) # encoding: [0xd0,0x00,0x00,0x00,0x10,0x00]
+#CHECK: trtr 0(1), 0(%r15) # encoding: [0xd0,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: trtr 0(1), 4095 # encoding: [0xd0,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: trtr 0(1), 4095(%r1) # encoding: [0xd0,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: trtr 0(1), 4095(%r15) # encoding: [0xd0,0x00,0x00,0x00,0xff,0xff]
+#CHECK: trtr 0(1,%r1), 0 # encoding: [0xd0,0x00,0x10,0x00,0x00,0x00]
+#CHECK: trtr 0(1,%r15), 0 # encoding: [0xd0,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: trtr 4095(1,%r1), 0 # encoding: [0xd0,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: trtr 4095(1,%r15), 0 # encoding: [0xd0,0x00,0xff,0xff,0x00,0x00]
+#CHECK: trtr 0(256,%r1), 0 # encoding: [0xd0,0xff,0x10,0x00,0x00,0x00]
+#CHECK: trtr 0(256,%r15), 0 # encoding: [0xd0,0xff,0xf0,0x00,0x00,0x00]
+
+ trtr 0(1), 0
+ trtr 0(1), 0(%r1)
+ trtr 0(1), 0(%r15)
+ trtr 0(1), 4095
+ trtr 0(1), 4095(%r1)
+ trtr 0(1), 4095(%r15)
+ trtr 0(1,%r1), 0
+ trtr 0(1,%r15), 0
+ trtr 4095(1,%r1), 0
+ trtr 4095(1,%r15), 0
+ trtr 0(256,%r1), 0
+ trtr 0(256,%r15), 0
+
+#CHECK: trtre %r0, %r0 # encoding: [0xb9,0xbd,0x00,0x00]
+#CHECK: trtre %r0, %r15 # encoding: [0xb9,0xbd,0x00,0x0f]
+#CHECK: trtre %r14, %r0 # encoding: [0xb9,0xbd,0x00,0xe0]
+#CHECK: trtre %r6, %r8 # encoding: [0xb9,0xbd,0x00,0x68]
+#CHECK: trtre %r4, %r13, 0 # encoding: [0xb9,0xbd,0x00,0x4d]
+#CHECK: trtre %r4, %r13, 15 # encoding: [0xb9,0xbd,0xf0,0x4d]
+
+ trtre %r0, %r0
+ trtre %r0, %r15
+ trtre %r14, %r0
+ trtre %r6, %r8
+ trtre %r4, %r13, 0
+ trtre %r4, %r13, 15
+
+#CHECK: trtt %r0, %r0 # encoding: [0xb9,0x90,0x00,0x00]
+#CHECK: trtt %r0, %r15 # encoding: [0xb9,0x90,0x00,0x0f]
+#CHECK: trtt %r14, %r0 # encoding: [0xb9,0x90,0x00,0xe0]
+#CHECK: trtt %r6, %r8 # encoding: [0xb9,0x90,0x00,0x68]
+#CHECK: trtt %r4, %r13, 0 # encoding: [0xb9,0x90,0x00,0x4d]
+#CHECK: trtt %r4, %r13, 15 # encoding: [0xb9,0x90,0xf0,0x4d]
+
+ trtt %r0, %r0
+ trtt %r0, %r15
+ trtt %r14, %r0
+ trtt %r6, %r8
+ trtt %r4, %r13, 0
+ trtt %r4, %r13, 15
+
#CHECK: ts 0 # encoding: [0x93,0x00,0x00,0x00]
#CHECK: ts 0(%r1) # encoding: [0x93,0x00,0x10,0x00]
#CHECK: ts 0(%r15) # encoding: [0x93,0x00,0xf0,0x00]
@@ -10748,6 +12323,92 @@
ts 4095(%r1)
ts 4095(%r15)
+#CHECK: unpk 0(1), 0(1) # encoding: [0xf3,0x00,0x00,0x00,0x00,0x00]
+#CHECK: unpk 0(1), 0(1,%r1) # encoding: [0xf3,0x00,0x00,0x00,0x10,0x00]
+#CHECK: unpk 0(1), 0(1,%r15) # encoding: [0xf3,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: unpk 0(1), 4095(1) # encoding: [0xf3,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: unpk 0(1), 4095(1,%r1) # encoding: [0xf3,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: unpk 0(1), 4095(1,%r15) # encoding: [0xf3,0x00,0x00,0x00,0xff,0xff]
+#CHECK: unpk 0(1,%r1), 0(1) # encoding: [0xf3,0x00,0x10,0x00,0x00,0x00]
+#CHECK: unpk 0(1,%r15), 0(1) # encoding: [0xf3,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: unpk 4095(1,%r1), 0(1) # encoding: [0xf3,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: unpk 4095(1,%r15), 0(1) # encoding: [0xf3,0x00,0xff,0xff,0x00,0x00]
+#CHECK: unpk 0(16,%r1), 0(1) # encoding: [0xf3,0xf0,0x10,0x00,0x00,0x00]
+#CHECK: unpk 0(16,%r15), 0(1) # encoding: [0xf3,0xf0,0xf0,0x00,0x00,0x00]
+#CHECK: unpk 0(1), 0(16,%r1) # encoding: [0xf3,0x0f,0x00,0x00,0x10,0x00]
+#CHECK: unpk 0(1), 0(16,%r15) # encoding: [0xf3,0x0f,0x00,0x00,0xf0,0x00]
+
+ unpk 0(1), 0(1)
+ unpk 0(1), 0(1,%r1)
+ unpk 0(1), 0(1,%r15)
+ unpk 0(1), 4095(1)
+ unpk 0(1), 4095(1,%r1)
+ unpk 0(1), 4095(1,%r15)
+ unpk 0(1,%r1), 0(1)
+ unpk 0(1,%r15), 0(1)
+ unpk 4095(1,%r1), 0(1)
+ unpk 4095(1,%r15), 0(1)
+ unpk 0(16,%r1), 0(1)
+ unpk 0(16,%r15), 0(1)
+ unpk 0(1), 0(16,%r1)
+ unpk 0(1), 0(16,%r15)
+
+#CHECK: unpka 0(1), 0 # encoding: [0xea,0x00,0x00,0x00,0x00,0x00]
+#CHECK: unpka 0(1), 0(%r1) # encoding: [0xea,0x00,0x00,0x00,0x10,0x00]
+#CHECK: unpka 0(1), 0(%r15) # encoding: [0xea,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: unpka 0(1), 4095 # encoding: [0xea,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: unpka 0(1), 4095(%r1) # encoding: [0xea,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: unpka 0(1), 4095(%r15) # encoding: [0xea,0x00,0x00,0x00,0xff,0xff]
+#CHECK: unpka 0(1,%r1), 0 # encoding: [0xea,0x00,0x10,0x00,0x00,0x00]
+#CHECK: unpka 0(1,%r15), 0 # encoding: [0xea,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: unpka 4095(1,%r1), 0 # encoding: [0xea,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: unpka 4095(1,%r15), 0 # encoding: [0xea,0x00,0xff,0xff,0x00,0x00]
+#CHECK: unpka 0(256,%r1), 0 # encoding: [0xea,0xff,0x10,0x00,0x00,0x00]
+#CHECK: unpka 0(256,%r15), 0 # encoding: [0xea,0xff,0xf0,0x00,0x00,0x00]
+
+ unpka 0(1), 0
+ unpka 0(1), 0(%r1)
+ unpka 0(1), 0(%r15)
+ unpka 0(1), 4095
+ unpka 0(1), 4095(%r1)
+ unpka 0(1), 4095(%r15)
+ unpka 0(1,%r1), 0
+ unpka 0(1,%r15), 0
+ unpka 4095(1,%r1), 0
+ unpka 4095(1,%r15), 0
+ unpka 0(256,%r1), 0
+ unpka 0(256,%r15), 0
+
+#CHECK: unpku 0(1), 0 # encoding: [0xe2,0x00,0x00,0x00,0x00,0x00]
+#CHECK: unpku 0(1), 0(%r1) # encoding: [0xe2,0x00,0x00,0x00,0x10,0x00]
+#CHECK: unpku 0(1), 0(%r15) # encoding: [0xe2,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: unpku 0(1), 4095 # encoding: [0xe2,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: unpku 0(1), 4095(%r1) # encoding: [0xe2,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: unpku 0(1), 4095(%r15) # encoding: [0xe2,0x00,0x00,0x00,0xff,0xff]
+#CHECK: unpku 0(1,%r1), 0 # encoding: [0xe2,0x00,0x10,0x00,0x00,0x00]
+#CHECK: unpku 0(1,%r15), 0 # encoding: [0xe2,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: unpku 4095(1,%r1), 0 # encoding: [0xe2,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: unpku 4095(1,%r15), 0 # encoding: [0xe2,0x00,0xff,0xff,0x00,0x00]
+#CHECK: unpku 0(256,%r1), 0 # encoding: [0xe2,0xff,0x10,0x00,0x00,0x00]
+#CHECK: unpku 0(256,%r15), 0 # encoding: [0xe2,0xff,0xf0,0x00,0x00,0x00]
+
+ unpku 0(1), 0
+ unpku 0(1), 0(%r1)
+ unpku 0(1), 0(%r15)
+ unpku 0(1), 4095
+ unpku 0(1), 4095(%r1)
+ unpku 0(1), 4095(%r15)
+ unpku 0(1,%r1), 0
+ unpku 0(1,%r15), 0
+ unpku 4095(1,%r1), 0
+ unpku 4095(1,%r15), 0
+ unpku 0(256,%r1), 0
+ unpku 0(256,%r15), 0
+
+#CHECK: upt # encoding: [0x01,0x02]
+
+ upt
+
#CHECK: x %r0, 0 # encoding: [0x57,0x00,0x00,0x00]
#CHECK: x %r0, 4095 # encoding: [0x57,0x00,0x0f,0xff]
#CHECK: x %r0, 0(%r1) # encoding: [0x57,0x00,0x10,0x00]
@@ -10907,3 +12568,33 @@
xy %r0, 524287(%r1,%r15)
xy %r0, 524287(%r15,%r1)
xy %r15, 0
+
+#CHECK: zap 0(1), 0(1) # encoding: [0xf8,0x00,0x00,0x00,0x00,0x00]
+#CHECK: zap 0(1), 0(1,%r1) # encoding: [0xf8,0x00,0x00,0x00,0x10,0x00]
+#CHECK: zap 0(1), 0(1,%r15) # encoding: [0xf8,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: zap 0(1), 4095(1) # encoding: [0xf8,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: zap 0(1), 4095(1,%r1) # encoding: [0xf8,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: zap 0(1), 4095(1,%r15) # encoding: [0xf8,0x00,0x00,0x00,0xff,0xff]
+#CHECK: zap 0(1,%r1), 0(1) # encoding: [0xf8,0x00,0x10,0x00,0x00,0x00]
+#CHECK: zap 0(1,%r15), 0(1) # encoding: [0xf8,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: zap 4095(1,%r1), 0(1) # encoding: [0xf8,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: zap 4095(1,%r15), 0(1) # encoding: [0xf8,0x00,0xff,0xff,0x00,0x00]
+#CHECK: zap 0(16,%r1), 0(1) # encoding: [0xf8,0xf0,0x10,0x00,0x00,0x00]
+#CHECK: zap 0(16,%r15), 0(1) # encoding: [0xf8,0xf0,0xf0,0x00,0x00,0x00]
+#CHECK: zap 0(1), 0(16,%r1) # encoding: [0xf8,0x0f,0x00,0x00,0x10,0x00]
+#CHECK: zap 0(1), 0(16,%r15) # encoding: [0xf8,0x0f,0x00,0x00,0xf0,0x00]
+
+ zap 0(1), 0(1)
+ zap 0(1), 0(1,%r1)
+ zap 0(1), 0(1,%r15)
+ zap 0(1), 4095(1)
+ zap 0(1), 4095(1,%r1)
+ zap 0(1), 4095(1,%r15)
+ zap 0(1,%r1), 0(1)
+ zap 0(1,%r15), 0(1)
+ zap 4095(1,%r1), 0(1)
+ zap 4095(1,%r15), 0(1)
+ zap 0(16,%r1), 0(1)
+ zap 0(16,%r15), 0(1)
+ zap 0(1), 0(16,%r1)
+ zap 0(1), 0(16,%r15)
diff --git a/test/Object/Inputs/COFF/empty-drectve.yaml b/test/Object/Inputs/COFF/empty-drectve.yaml
new file mode 100644
index 000000000000..af288807e3ad
--- /dev/null
+++ b/test/Object/Inputs/COFF/empty-drectve.yaml
@@ -0,0 +1,14 @@
+--- !COFF
+header:
+ Machine: IMAGE_FILE_MACHINE_I386
+sections:
+ - Name: .drectve
+ Characteristics: [ IMAGE_SCN_LNK_INFO, IMAGE_SCN_LNK_REMOVE ]
+ SectionData: ''
+symbols:
+ - Name: .drectve
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_NULL
+ StorageClass: IMAGE_SYM_CLASS_STATIC
diff --git a/test/Object/X86/archive-symbol-table.s b/test/Object/X86/archive-symbol-table.s
new file mode 100644
index 000000000000..2e6fcbed60b1
--- /dev/null
+++ b/test/Object/X86/archive-symbol-table.s
@@ -0,0 +1,19 @@
+# RUN: llvm-mc %s -o %t.o -filetype=obj -triple=x86_64-pc-linux
+# RUN: rm -f %t
+# RUN: llvm-ar rcs %t %t.o
+# RUN: llvm-nm -M %t | FileCheck %s
+
+# Test that weak undefined symbols don't show up in the archive symbol
+# table.
+
+.global foo
+foo:
+.weak bar
+.quad bar
+
+# CHECK: Archive map
+# CHECK-NEXT: foo in archive-symbol-table.s.tmp.o
+# CHECK-NOT: in
+# CHECK: archive-symbol-table.s.tmp.o
+# CHECK-NEXT: w bar
+# CHECK-NEXT: T foo
diff --git a/test/Object/X86/nm-ir.ll b/test/Object/X86/nm-ir.ll
index 29f7a5c7018c..c90f67b15160 100644
--- a/test/Object/X86/nm-ir.ll
+++ b/test/Object/X86/nm-ir.ll
@@ -12,7 +12,7 @@
; CHECK-NEXT: C g3
; CHECK-NOT: g4
; CHECK-NEXT: T global_asm_sym
-; CHECK-NEXT: D ifunc_f1
+; CHECK-NEXT: T ifunc_f1
; CHECK-NEXT: t local_asm_sym
; CHECK-NEXT: U undef_asm_sy
diff --git a/test/Object/coff-empty-drectve.test b/test/Object/coff-empty-drectve.test
new file mode 100644
index 000000000000..f76d7bf72716
--- /dev/null
+++ b/test/Object/coff-empty-drectve.test
@@ -0,0 +1,3 @@
+RUN: yaml2obj %p/Inputs/COFF/empty-drectve.yaml | llvm-readobj -coff-directives - | FileCheck %s
+
+CHECK: Directive(s): {{$}}
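+
+# The {{$}} regex anchors the match at end of line, so the check passes
+# only when nothing follows "Directive(s):", i.e. the empty .drectve
+# section round-trips to an empty directive string.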
diff --git a/test/Object/invalid.test b/test/Object/invalid.test
index fc1a77b2c0c0..dcbac32f7196 100644
--- a/test/Object/invalid.test
+++ b/test/Object/invalid.test
@@ -53,7 +53,7 @@ INVALID-SYMTAB-SIZE: size is not a multiple of sh_entsize
RUN: not llvm-readobj -t %p/Inputs/invalid-xindex-size.elf 2>&1 | FileCheck --check-prefix=INVALID-XINDEX-SIZE %s
-INVALID-XINDEX-SIZE: Invalid data was encountered while parsing the file.
+INVALID-XINDEX-SIZE: Invalid data was encountered while parsing the file
RUN: not llvm-readobj -t %p/Inputs/invalid-e_shnum.elf 2>&1 | FileCheck --check-prefix=INVALID-SH-NUM %s
INVALID-SH-NUM: invalid e_phentsize
@@ -77,7 +77,7 @@ RUN: FileCheck --check-prefix=INVALID-SECTION-SIZE2 %s
INVALID-SECTION-SIZE2: invalid section offset
RUN: not llvm-readobj -t %p/Inputs/invalid-sections-num.elf 2>&1 | FileCheck --check-prefix=INVALID-SECTION-NUM %s
-INVALID-SECTION-NUM: Invalid data was encountered while parsing the file.
+INVALID-SECTION-NUM: Invalid data was encountered while parsing the file
RUN: not llvm-readobj -r %p/Inputs/invalid-rel-sym.elf 2>&1 | FileCheck --check-prefix=INVALID-REL-SYM %s
INVALID-REL-SYM: invalid section offset
diff --git a/test/Object/wasm-invalid-start.test b/test/Object/wasm-invalid-start.test
new file mode 100644
index 000000000000..12f75676345f
--- /dev/null
+++ b/test/Object/wasm-invalid-start.test
@@ -0,0 +1,10 @@
+# RUN: yaml2obj %s | not llvm-objdump -h - 2>&1 | FileCheck %s
+
+!WASM
+FileHeader:
+ Version: 0x00000001
+Sections:
+ - Type: START
+ StartFunction: 0
+
+# CHECK: {{.*}}: Invalid start function
diff --git a/test/ObjectYAML/wasm/export_section.yaml b/test/ObjectYAML/wasm/export_section.yaml
index 1d1a16fb8335..89ebee328246 100644
--- a/test/ObjectYAML/wasm/export_section.yaml
+++ b/test/ObjectYAML/wasm/export_section.yaml
@@ -5,12 +5,18 @@ FileHeader:
Sections:
- Type: EXPORT
Exports:
- - Name: foo
- Kind: FUNCTION
- Index: 0
- - Name: bar
+ - Name: function_export
Kind: FUNCTION
Index: 1
+ - Name: global_export
+ Kind: GLOBAL
+ Index: 1
+ - Name: memory_export
+ Kind: MEMORY
+ Index: 0
+ - Name: table_export
+ Kind: TABLE
+ Index: 0
...
# CHECK: --- !WASM
# CHECK: FileHeader:
@@ -18,10 +24,16 @@ Sections:
# CHECK: Sections:
# CHECK: - Type: EXPORT
# CHECK: Exports:
-# CHECK: - Name: foo
-# CHECK: Kind: FUNCTION
-# CHECK: Index: 0
-# CHECK: - Name: bar
+# CHECK: - Name: function_export
# CHECK: Kind: FUNCTION
# CHECK: Index: 1
+# CHECK: - Name: global_export
+# CHECK: Kind: GLOBAL
+# CHECK: Index: 1
+# CHECK: - Name: memory_export
+# CHECK: Kind: MEMORY
+# CHECK: Index: 0
+# CHECK: - Name: table_export
+# CHECK: Kind: TABLE
+# CHECK: Index: 0
# CHECK: ...
diff --git a/test/ObjectYAML/wasm/function_section.yaml b/test/ObjectYAML/wasm/function_section.yaml
index 39e6b75d5cdc..571b762787a2 100644
--- a/test/ObjectYAML/wasm/function_section.yaml
+++ b/test/ObjectYAML/wasm/function_section.yaml
@@ -4,9 +4,7 @@ FileHeader:
Version: 0x00000001
Sections:
- Type: FUNCTION
- FunctionTypes:
- - 1
- - 0
+ FunctionTypes: [ 1, 0 ]
...
# CHECK: --- !WASM
# CHECK: FileHeader:
diff --git a/test/ObjectYAML/wasm/import_section.yaml b/test/ObjectYAML/wasm/import_section.yaml
index 52f466a00b66..115d4cc0bd6b 100644
--- a/test/ObjectYAML/wasm/import_section.yaml
+++ b/test/ObjectYAML/wasm/import_section.yaml
@@ -9,19 +9,32 @@ Sections:
ParamTypes:
- I32
- Type: IMPORT
- Imports:
+ Imports:
- Module: foo
- Field: bar
+ Field: imported_function
Kind: FUNCTION
SigIndex: 0
- Module: fiz
- Field: baz
+ Field: imported_global
Kind: GLOBAL
GlobalType: I32
GlobalMutable: false
- - Type: FUNCTION
- FunctionTypes:
- - 0
+ - Module: foo
+ Field: imported_memory
+ Kind: MEMORY
+ Memory:
+ Flags: 0x00000001
+ Initial: 0x00000010
+ Maximum: 0x00000011
+ - Module: foo
+ Field: imported_table
+ Kind: TABLE
+ Table:
+ ElemType: ANYFUNC
+ Limits:
+ Flags: 0x00000001
+ Initial: 0x00000020
+ Maximum: 0x00000022
...
# CHECK: --- !WASM
# CHECK: FileHeader:
@@ -30,12 +43,28 @@ Sections:
# CHECK: - Type: IMPORT
# CHECK: Imports:
# CHECK: - Module: foo
-# CHECK: Field: bar
+# CHECK: Field: imported_function
# CHECK: Kind: FUNCTION
# CHECK: SigIndex: 0
# CHECK: - Module: fiz
-# CHECK: Field: baz
+# CHECK: Field: imported_global
# CHECK: Kind: GLOBAL
# CHECK: GlobalType: I32
# CHECK: GlobalMutable: false
+# CHECK: - Module: foo
+# CHECK: Field: imported_memory
+# CHECK: Kind: MEMORY
+# CHECK: Memory:
+# CHECK: Flags: 0x00000001
+# CHECK: Initial: 0x00000010
+# CHECK: Maximum: 0x00000011
+# CHECK: - Module: foo
+# CHECK: Field: imported_table
+# CHECK: Kind: TABLE
+# CHECK: Table:
+# CHECK: ElemType: ANYFUNC
+# CHECK: Limits:
+# CHECK: Flags: 0x00000001
+# CHECK: Initial: 0x00000020
+# CHECK: Maximum: 0x00000022
# CHECK: ...
diff --git a/test/ObjectYAML/wasm/start_section.yaml b/test/ObjectYAML/wasm/start_section.yaml
index 41301a620037..38feebcdf993 100644
--- a/test/ObjectYAML/wasm/start_section.yaml
+++ b/test/ObjectYAML/wasm/start_section.yaml
@@ -1,8 +1,17 @@
# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+
--- !WASM
FileHeader:
Version: 0x00000001
Sections:
+ - Type: TYPE
+ Signatures:
+ - ReturnType: I32
+ ParamTypes:
+ - F32
+ - F32
+ - Type: FUNCTION
+ FunctionTypes: [ 0, 0, 0 ]
- Type: START
StartFunction: 1
...
diff --git a/test/TableGen/AsmVariant.td b/test/TableGen/AsmVariant.td
index 6c50241e5ae1..cb5d32385d3b 100644
--- a/test/TableGen/AsmVariant.td
+++ b/test/TableGen/AsmVariant.td
@@ -1,6 +1,6 @@
// RUN: llvm-tblgen -gen-asm-matcher -I %p/../../include %s | FileCheck %s
-// Check that cpecifying AsmVariant works correctly
+// Check that specifying AsmVariant works correctly
include "llvm/Target/Target.td"
diff --git a/test/TableGen/RegisterEncoder.td b/test/TableGen/RegisterEncoder.td
new file mode 100644
index 000000000000..a0472c5feffa
--- /dev/null
+++ b/test/TableGen/RegisterEncoder.td
@@ -0,0 +1,35 @@
+// RUN: llvm-tblgen -gen-emitter -I %p/../../include %s | FileCheck %s
+
+// Check that EncoderMethod for RegisterOperand is working correctly
+
+include "llvm/Target/Target.td"
+
+def ArchInstrInfo : InstrInfo { }
+
+def Arch : Target {
+ let InstructionSet = ArchInstrInfo;
+}
+
+def Reg : Register<"reg">;
+
+def RegClass : RegisterClass<"foo", [i32], 0, (add Reg)>;
+
+def RegOperand : RegisterOperand<RegClass> {
+ let EncoderMethod = "barEncoder";
+}
+
+def foo : Instruction {
+ let Size = 1;
+
+ let OutOperandList = (outs);
+ let InOperandList = (ins RegOperand:$bar);
+
+ bits<8> bar;
+ bits<8> Inst = bar;
+}
+
+// CHECK: case ::foo: {
+// CHECK: op = barEncoder
+// CHECK: Value |= op & UINT64_C(255);
+// CHECK: break;
+// CHECK: }
\ No newline at end of file
diff --git a/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll b/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
index 509a4d7bfa18..8313cfac04ee 100644
--- a/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
+++ b/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
; This test checks to make sure that the CodeExtractor
; properly sets the entry count for the function that is
diff --git a/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll b/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
index 425e96973596..8e362080dc48 100644
--- a/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
+++ b/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis -S | FileCheck %s
; This test checks to make sure that CodeExtractor updates
; the exit branch probabilities for multiple exit blocks.
diff --git a/test/Transforms/CodeExtractor/PartialInlineAnd.ll b/test/Transforms/CodeExtractor/PartialInlineAnd.ll
index e981a5ba5816..d32d834d2df3 100644
--- a/test/Transforms/CodeExtractor/PartialInlineAnd.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineAnd.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -partial-inliner -S | FileCheck %s
; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
-; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
; Function Attrs: nounwind uwtable
define i32 @bar(i32 %arg) local_unnamed_addr #0 {
diff --git a/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll b/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
new file mode 100644
index 000000000000..3a7a9752e507
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
@@ -0,0 +1,41 @@
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -passes=partial-inliner -S | FileCheck %s
+
+define i32 @Func(i1 %cond, i32* align 4 %align.val) !prof !1 {
+; CHECK: @Func({{.*}}) !prof [[REMAINCOUNT:![0-9]+]]
+entry:
+ br i1 %cond, label %if.then, label %return
+if.then:
+ ; Dummy store to have more than 0 uses
+ store i32 10, i32* %align.val, align 4
+ br label %return
+return: ; preds = %entry
+ ret i32 0
+}
+
+define internal i32 @Caller1(i1 %cond, i32* align 2 %align.val) !prof !3{
+entry:
+; CHECK-LABEL: @Caller1
+; CHECK: br
+; CHECK: call void @Func.1_
+; CHECK: br
+; CHECK: call void @Func.1_
+ %val = call i32 @Func(i1 %cond, i32* %align.val)
+ %val2 = call i32 @Func(i1 %cond, i32* %align.val)
+ ret i32 %val
+}
+
+define internal i32 @Caller2(i1 %cond, i32* align 2 %align.val) !prof !2{
+entry:
+; CHECK-LABEL: @Caller2
+; CHECK: br
+; CHECK: call void @Func.1_
+ %val = call i32 @Func(i1 %cond, i32* %align.val)
+ ret i32 %val
+}
+
+; CHECK: [[REMAINCOUNT]] = !{!"function_entry_count", i64 150}
+!1 = !{!"function_entry_count", i64 200}
+!2 = !{!"function_entry_count", i64 10}
+!3 = !{!"function_entry_count", i64 20}
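+
+; A rough sanity check of the 150 matched by REMAINCOUNT above, assuming
+; each duplicated call site subtracts its caller's entry count: Func
+; starts at 200 (!1), Caller1 (entry count 20, !3) ends up with two calls
+; and Caller2 (entry count 10, !2) with one, so 200 - 2*20 - 10 = 150
+; remains on the outlined Func.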
+
diff --git a/test/Transforms/CodeExtractor/PartialInlineHighCost.ll b/test/Transforms/CodeExtractor/PartialInlineHighCost.ll
new file mode 100644
index 000000000000..e43a94dc6c37
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineHighCost.ll
@@ -0,0 +1,107 @@
+; The outlined region has high frequency and the outlining
+; call sequence is expensive (inputs, outputs, multiple exits, etc.)
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=NOCOST %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=NOCOST %s
+
+
+; Function Attrs: nounwind
+define i32 @bar_hot_outline_region(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = icmp slt i32 %arg, 0
+ br i1 %tmp, label %bb1, label %bb16, !prof !1
+
+bb1: ; preds = %bb
+ %tmp2 = tail call i32 (...) @foo() #0
+ %tmp3 = tail call i32 (...) @foo() #0
+ %tmp4 = tail call i32 (...) @foo() #0
+ %tmp5 = tail call i32 (...) @foo() #0
+ %tmp6 = tail call i32 (...) @foo() #0
+ %tmp7 = tail call i32 (...) @foo() #0
+ %tmp8 = add nsw i32 %arg, 1
+ %tmp9 = tail call i32 @goo(i32 %tmp8) #0
+ %tmp10 = tail call i32 (...) @foo() #0
+ %tmp11 = icmp eq i32 %tmp10, 0
+ br i1 %tmp11, label %bb12, label %bb16
+
+bb12: ; preds = %bb1
+ %tmp13 = tail call i32 (...) @foo() #0
+ %tmp14 = icmp eq i32 %tmp13, 0
+ %tmp15 = select i1 %tmp14, i32 0, i32 3
+ br label %bb16
+
+bb16: ; preds = %bb12, %bb1, %bb
+ %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ]
+ ret i32 %tmp17
+}
+
+define i32 @bar_cold_outline_region(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = icmp slt i32 %arg, 0
+ br i1 %tmp, label %bb1, label %bb16, !prof !2
+
+bb1: ; preds = %bb
+ %tmp2 = tail call i32 (...) @foo() #0
+ %tmp3 = tail call i32 (...) @foo() #0
+ %tmp4 = tail call i32 (...) @foo() #0
+ %tmp5 = tail call i32 (...) @foo() #0
+ %tmp6 = tail call i32 (...) @foo() #0
+ %tmp7 = tail call i32 (...) @foo() #0
+ %tmp8 = add nsw i32 %arg, 1
+ %tmp9 = tail call i32 @goo(i32 %tmp8) #0
+ %tmp10 = tail call i32 (...) @foo() #0
+ %tmp11 = icmp eq i32 %tmp10, 0
+ br i1 %tmp11, label %bb12, label %bb16
+
+bb12: ; preds = %bb1
+ %tmp13 = tail call i32 (...) @foo() #0
+ %tmp14 = icmp eq i32 %tmp13, 0
+ %tmp15 = select i1 %tmp14, i32 0, i32 3
+ br label %bb16
+
+bb16: ; preds = %bb12, %bb1, %bb
+ %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ]
+ ret i32 %tmp17
+}
+
+; Function Attrs: nounwind
+declare i32 @foo(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+declare i32 @goo(i32) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+; CHECK-LABEL: @dummy_caller
+; CHECK-NOT: br i1
+; CHECK-NOT: call{{.*}}bar_hot_outline_region.
+; NOCOST-LABEL: @dummy_caller
+; NOCOST: br i1
+; NOCOST: call{{.*}}bar_hot_outline_region.
+
+ %tmp = tail call i32 @bar_hot_outline_region(i32 %arg)
+ ret i32 %tmp
+}
+
+define i32 @dummy_caller2(i32 %arg) local_unnamed_addr #0 {
+bb:
+; CHECK-LABEL: @dummy_caller2
+; CHECK: br i1
+; CHECK: call{{.*}}bar_cold_outline_region.
+; NOCOST-LABEL: @dummy_caller2
+; NOCOST: br i1
+; NOCOST: call{{.*}}bar_cold_outline_region.
+
+ %tmp = tail call i32 @bar_cold_outline_region(i32 %arg)
+ ret i32 %tmp
+}
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0 (trunk 301898)"}
+!1 = !{!"branch_weights", i32 2000, i32 1}
+!2 = !{!"branch_weights", i32 1, i32 100}
diff --git a/test/Transforms/CodeExtractor/PartialInlineOr.ll b/test/Transforms/CodeExtractor/PartialInlineOr.ll
index 5408b4faaf70..758945c7ade5 100644
--- a/test/Transforms/CodeExtractor/PartialInlineOr.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineOr.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
-; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
diff --git a/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll b/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
index 282d300fadb9..fb6d1c335361 100644
--- a/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -partial-inliner -S | FileCheck %s
; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -S | FileCheck --check-prefix=LIMIT3 %s
-; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -S | FileCheck --check-prefix=LIMIT3 %s
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis -S | FileCheck --check-prefix=LIMIT3 %s
+; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis -S | FileCheck --check-prefix=LIMIT3 %s
; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT2 %s
; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT2 %s
diff --git a/test/Transforms/CodeExtractor/SingleCondition.ll b/test/Transforms/CodeExtractor/SingleCondition.ll
index 90cda889a21b..4110cd95b7ee 100644
--- a/test/Transforms/CodeExtractor/SingleCondition.ll
+++ b/test/Transforms/CodeExtractor/SingleCondition.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
-; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -passes=partial-inliner -S | FileCheck %s
define internal i32 @inlinedFunc(i1 %cond, i32* align 4 %align.val) {
entry:
diff --git a/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll b/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
index 41d883c8c378..0f8a71907d85 100644
--- a/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
+++ b/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner | llc -filetype=null
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis | llc -filetype=null
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
; This testcase checks to see if CodeExtractor properly inherits
; target specific attributes for the extracted function. This can
; cause certain instructions that depend on the attributes to not
diff --git a/test/Transforms/CodeGenPrepare/section-samplepgo.ll b/test/Transforms/CodeGenPrepare/section-samplepgo.ll
new file mode 100644
index 000000000000..93d2a5f2542c
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/section-samplepgo.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -codegenprepare -S | FileCheck %s
+
+target triple = "x86_64-pc-linux-gnu"
+
+; This tests that hot/cold functions get the correct section prefix assigned
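+; With this ProfileSummary, the hot threshold works out to a count of 100 and
+; the cold threshold to a count of 1; the entry counts and callsite-weight
+; sums below are classified against those bounds.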
+
+; CHECK: hot_func{{.*}}!section_prefix ![[HOT_ID:[0-9]+]]
+; The entry is hot
+define void @hot_func() !prof !15 {
+ ret void
+}
+
+; CHECK: hot_call_func{{.*}}!section_prefix ![[HOT_ID]]
+; The sum of the 2 callsites is hot
+define void @hot_call_func() !prof !16 {
+ call void @hot_func(), !prof !17
+ call void @hot_func(), !prof !17
+ ret void
+}
+
+; CHECK-NOT: normal_func{{.*}}!section_prefix
+; The sum of all callsites is neither hot nor cold
+define void @normal_func() !prof !16 {
+ call void @hot_func(), !prof !17
+ call void @hot_func(), !prof !18
+ call void @hot_func(), !prof !18
+ ret void
+}
+
+; CHECK: cold_func{{.*}}!section_prefix ![[COLD_ID:[0-9]+]]
+; The entry and the callsite are both cold
+define void @cold_func() !prof !16 {
+ call void @hot_func(), !prof !18
+ ret void
+}
+
+; CHECK: ![[HOT_ID]] = !{!"function_section_prefix", !".hot"}
+; CHECK: ![[COLD_ID]] = !{!"function_section_prefix", !".unlikely"}
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"SampleProfile"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 999000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
+!15 = !{!"function_entry_count", i64 1000}
+!16 = !{!"function_entry_count", i64 1}
+!17 = !{!"branch_weights", i32 80}
+!18 = !{!"branch_weights", i32 1}
diff --git a/test/Transforms/CodeGenPrepare/section.ll b/test/Transforms/CodeGenPrepare/section.ll
index 2c96612e1baf..4f3144e7fc73 100644
--- a/test/Transforms/CodeGenPrepare/section.ll
+++ b/test/Transforms/CodeGenPrepare/section.ll
@@ -10,32 +10,32 @@ define void @hot_func() !prof !15 {
ret void
}
-; CHECK: hot_call_func{{.*}}!section_prefix ![[HOT_ID]]
-; The sum of 2 callsites are hot
-define void @hot_call_func() !prof !16 {
+; For instrumentation-based PGO, we should only look at entry counts,
+; not call site VP metadata (which can exist on a value-profiled memcpy,
+; or be left behind after static-analysis-based devirtualization).
+; CHECK: cold_func1{{.*}}!section_prefix ![[COLD_ID:[0-9]+]]
+define void @cold_func1() !prof !16 {
call void @hot_func(), !prof !17
call void @hot_func(), !prof !17
ret void
}
-; CHECK-NOT: normal_func{{.*}}!section_prefix
-; The sum of all callsites are neither hot or cold
-define void @normal_func() !prof !16 {
+; CHECK: cold_func2{{.*}}!section_prefix
+define void @cold_func2() !prof !16 {
call void @hot_func(), !prof !17
call void @hot_func(), !prof !18
call void @hot_func(), !prof !18
ret void
}
-; CHECK: cold_func{{.*}}!section_prefix ![[COLD_ID:[0-9]+]]
-; The entry and the callsite are both cold
-define void @cold_func() !prof !16 {
+; CHECK: cold_func3{{.*}}!section_prefix ![[COLD_ID]]
+define void @cold_func3() !prof !16 {
call void @hot_func(), !prof !18
ret void
}
; CHECK: ![[HOT_ID]] = !{!"function_section_prefix", !".hot"}
-; CHECK: ![[COLD_ID]] = !{!"function_section_prefix", !".cold"}
+; CHECK: ![[COLD_ID]] = !{!"function_section_prefix", !".unlikely"}
!llvm.module.flags = !{!1}
!1 = !{i32 1, !"ProfileSummary", !2}
!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
diff --git a/test/Transforms/ConstProp/calls-math-finite.ll b/test/Transforms/ConstProp/calls-math-finite.ll
new file mode 100644
index 000000000000..00041f3e4a4b
--- /dev/null
+++ b/test/Transforms/ConstProp/calls-math-finite.ll
@@ -0,0 +1,83 @@
+; RUN: opt < %s -constprop -S | FileCheck %s
+
+; Test to verify that constant folding can occur when math
+; routines are mapped to the __<func>_finite versions of
+; functions because __FINITE_MATH_ONLY__ is enabled in the
+; math headers. All calls should constant-fold away in this
+; test.
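+;
+; (With __FINITE_MATH_ONLY__, glibc's bits/math-finite.h redirects a call such
+; as exp(3.0) to __exp_finite(3.0); TargetLibraryInfo now recognizes these
+; names, so they fold just like the ordinary libm calls.)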
+
+declare double @__acos_finite(double) #0
+declare float @__acosf_finite(float) #0
+declare double @__asin_finite(double) #0
+declare float @__asinf_finite(float) #0
+declare double @__atan2_finite(double, double) #0
+declare float @__atan2f_finite(float, float) #0
+declare double @__cosh_finite(double) #0
+declare float @__coshf_finite(float) #0
+declare double @__exp2_finite(double) #0
+declare float @__exp2f_finite(float) #0
+declare double @__exp_finite(double) #0
+declare float @__expf_finite(float) #0
+declare double @__log10_finite(double) #0
+declare float @__log10f_finite(float) #0
+declare double @__log_finite(double) #0
+declare float @__logf_finite(float) #0
+declare double @__pow_finite(double, double) #0
+declare float @__powf_finite(float, float) #0
+declare double @__sinh_finite(double) #0
+declare float @__sinhf_finite(float) #0
+
+attributes #0 = { nounwind readnone }
+
+define void @T() {
+; CHECK-LABEL: @T(
+
+; CHECK-NOT: call
+; CHECK: ret
+
+ %slot = alloca double
+ %slotf = alloca float
+
+ %ACOS = call fast double @__acos_finite(double 1.000000e+00)
+ store double %ACOS, double* %slot
+ %ASIN = call fast double @__asin_finite(double 1.000000e+00)
+ store double %ASIN, double* %slot
+ %ATAN2 = call fast double @__atan2_finite(double 3.000000e+00, double 4.000000e+00)
+ store double %ATAN2, double* %slot
+ %COSH = call fast double @__cosh_finite(double 3.000000e+00)
+ store double %COSH, double* %slot
+ %EXP = call fast double @__exp_finite(double 3.000000e+00)
+ store double %EXP, double* %slot
+ %EXP2 = call fast double @__exp2_finite(double 3.000000e+00)
+ store double %EXP2, double* %slot
+ %LOG = call fast double @__log_finite(double 3.000000e+00)
+ store double %LOG, double* %slot
+ %LOG10 = call fast double @__log10_finite(double 3.000000e+00)
+ store double %LOG10, double* %slot
+ %POW = call fast double @__pow_finite(double 1.000000e+00, double 4.000000e+00)
+ store double %POW, double* %slot
+ %SINH = call fast double @__sinh_finite(double 3.000000e+00)
+ store double %SINH, double* %slot
+
+ %ACOSF = call fast float @__acosf_finite(float 1.000000e+00)
+ store float %ACOSF, float* %slotf
+ %ASINF = call fast float @__asinf_finite(float 1.000000e+00)
+ store float %ASINF, float* %slotf
+ %ATAN2F = call fast float @__atan2f_finite(float 3.000000e+00, float 4.000000e+00)
+ store float %ATAN2F, float* %slotf
+ %COSHF = call fast float @__coshf_finite(float 3.000000e+00)
+ store float %COSHF, float* %slotf
+ %EXPF = call fast float @__expf_finite(float 3.000000e+00)
+ store float %EXPF, float* %slotf
+ %EXP2F = call fast float @__exp2f_finite(float 3.000000e+00)
+ store float %EXP2F, float* %slotf
+ %LOGF = call fast float @__logf_finite(float 3.000000e+00)
+ store float %LOGF, float* %slotf
+ %LOG10F = call fast float @__log10f_finite(float 3.000000e+00)
+ store float %LOG10F, float* %slotf
+ %POWF = call fast float @__powf_finite(float 3.000000e+00, float 4.000000e+00)
+ store float %POWF, float* %slotf
+ %SINHF = call fast float @__sinhf_finite(float 3.000000e+00)
+ store float %SINHF, float* %slotf
+ ret void
+}
diff --git a/test/Transforms/ConstProp/calls.ll b/test/Transforms/ConstProp/calls.ll
index 1175ea522175..161637cc92b8 100644
--- a/test/Transforms/ConstProp/calls.ll
+++ b/test/Transforms/ConstProp/calls.ll
@@ -184,212 +184,6 @@ define double @T() {
ret double %d
}
-define i1 @test_sse_cvts_exact() nounwind readnone {
-; CHECK-LABEL: @test_sse_cvts_exact(
-; CHECK-NOT: call
-; CHECK: ret i1 true
-entry:
- %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind
- %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind
- %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 7.0, double undef>) nounwind
- %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 7.0, double undef>) nounwind
- %sum02 = add i32 %i0, %i2
- %sum13 = add i64 %i1, %i3
- %cmp02 = icmp eq i32 %sum02, 10
- %cmp13 = icmp eq i64 %sum13, 10
- %b = and i1 %cmp02, %cmp13
- ret i1 %b
-}
-
-; Inexact values should not fold as they are dependent on rounding mode
-define i1 @test_sse_cvts_inexact() nounwind readnone {
-; CHECK-LABEL: @test_sse_cvts_inexact(
-; CHECK: call
-; CHECK: call
-; CHECK: call
-; CHECK: call
-entry:
- %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
- %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
- %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 1.75, double undef>) nounwind
- %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 1.75, double undef>) nounwind
- %sum02 = add i32 %i0, %i2
- %sum13 = add i64 %i1, %i3
- %cmp02 = icmp eq i32 %sum02, 4
- %cmp13 = icmp eq i64 %sum13, 4
- %b = and i1 %cmp02, %cmp13
- ret i1 %b
-}
-
-; FLT_MAX/DBL_MAX should not fold
-define i1 @test_sse_cvts_max() nounwind readnone {
-; CHECK-LABEL: @test_sse_cvts_max(
-; CHECK: call
-; CHECK: call
-; CHECK: call
-; CHECK: call
-entry:
- %fm = bitcast <4 x i32> <i32 2139095039, i32 undef, i32 undef, i32 undef> to <4 x float>
- %dm = bitcast <2 x i64> <i64 9218868437227405311, i64 undef> to <2 x double>
- %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind
- %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind
- %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %dm) nounwind
- %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind
- %sum02 = add i32 %i0, %i2
- %sum13 = add i64 %i1, %i3
- %sum02.sext = sext i32 %sum02 to i64
- %b = icmp eq i64 %sum02.sext, %sum13
- ret i1 %b
-}
-
-; INF should not fold
-define i1 @test_sse_cvts_inf() nounwind readnone {
-; CHECK-LABEL: @test_sse_cvts_inf(
-; CHECK: call
-; CHECK: call
-; CHECK: call
-; CHECK: call
-entry:
- %fm = bitcast <4 x i32> <i32 2139095040, i32 undef, i32 undef, i32 undef> to <4 x float>
- %dm = bitcast <2 x i64> <i64 9218868437227405312, i64 undef> to <2 x double>
- %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind
- %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind
- %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %dm) nounwind
- %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind
- %sum02 = add i32 %i0, %i2
- %sum13 = add i64 %i1, %i3
- %sum02.sext = sext i32 %sum02 to i64
- %b = icmp eq i64 %sum02.sext, %sum13
- ret i1 %b
-}
-
-; NAN should not fold
-define i1 @test_sse_cvts_nan() nounwind readnone {
-; CHECK-LABEL: @test_sse_cvts_nan(
-; CHECK: call
-; CHECK: call
-; CHECK: call
-; CHECK: call
-entry:
- %fm = bitcast <4 x i32> <i32 2143289344, i32 undef, i32 undef, i32 undef> to <4 x float>
- %dm = bitcast <2 x i64> <i64 9221120237041090560, i64 undef> to <2 x double>
- %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind
- %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind
- %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %dm) nounwind
- %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind
- %sum02 = add i32 %i0, %i2
- %sum13 = add i64 %i1, %i3
- %sum02.sext = sext i32 %sum02 to i64
- %b = icmp eq i64 %sum02.sext, %sum13
- ret i1 %b
-}
-
-define i1 @test_sse_cvtts_exact() nounwind readnone {
-; CHECK-LABEL: @test_sse_cvtts_exact(
-; CHECK-NOT: call
-; CHECK: ret i1 true
-entry:
- %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind
- %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind
- %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 7.0, double undef>) nounwind
- %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 7.0, double undef>) nounwind
- %sum02 = add i32 %i0, %i2
- %sum13 = add i64 %i1, %i3
- %cmp02 = icmp eq i32 %sum02, 10
- %cmp13 = icmp eq i64 %sum13, 10
- %b = and i1 %cmp02, %cmp13
- ret i1 %b
-}
-
-define i1 @test_sse_cvtts_inexact() nounwind readnone {
-; CHECK-LABEL: @test_sse_cvtts_inexact(
-; CHECK-NOT: call
-; CHECK: ret i1 true
-entry:
- %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
- %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
- %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 1.75, double undef>) nounwind
- %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 1.75, double undef>) nounwind
- %sum02 = add i32 %i0, %i2
- %sum13 = add i64 %i1, %i3
- %cmp02 = icmp eq i32 %sum02, 2
- %cmp13 = icmp eq i64 %sum13, 2
- %b = and i1 %cmp02, %cmp13
- ret i1 %b
-}
-
-; FLT_MAX/DBL_MAX should not fold
-define i1 @test_sse_cvtts_max() nounwind readnone {
-; CHECK-LABEL: @test_sse_cvtts_max(
-; CHECK: call
-; CHECK: call
-; CHECK: call
-; CHECK: call
-entry:
- %fm = bitcast <4 x i32> <i32 2139095039, i32 undef, i32 undef, i32 undef> to <4 x float>
- %dm = bitcast <2 x i64> <i64 9218868437227405311, i64 undef> to <2 x double>
- %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind
- %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind
- %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind
- %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind
- %sum02 = add i32 %i0, %i2
- %sum13 = add i64 %i1, %i3
- %sum02.sext = sext i32 %sum02 to i64
- %b = icmp eq i64 %sum02.sext, %sum13
- ret i1 %b
-}
-
-; INF should not fold
-define i1 @test_sse_cvtts_inf() nounwind readnone {
-; CHECK-LABEL: @test_sse_cvtts_inf(
-; CHECK: call
-; CHECK: call
-; CHECK: call
-; CHECK: call
-entry:
- %fm = bitcast <4 x i32> <i32 2139095040, i32 undef, i32 undef, i32 undef> to <4 x float>
- %dm = bitcast <2 x i64> <i64 9218868437227405312, i64 undef> to <2 x double>
- %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind
- %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind
- %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind
- %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind
- %sum02 = add i32 %i0, %i2
- %sum13 = add i64 %i1, %i3
- %sum02.sext = sext i32 %sum02 to i64
- %b = icmp eq i64 %sum02.sext, %sum13
- ret i1 %b
-}
-
-; NAN should not fold
-define i1 @test_sse_cvtts_nan() nounwind readnone {
-; CHECK-LABEL: @test_sse_cvtts_nan(
-; CHECK: call
-; CHECK: call
-; CHECK: call
-; CHECK: call
-entry:
- %fm = bitcast <4 x i32> <i32 2143289344, i32 undef, i32 undef, i32 undef> to <4 x float>
- %dm = bitcast <2 x i64> <i64 9221120237041090560, i64 undef> to <2 x double>
- %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind
- %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind
- %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind
- %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind
- %sum02 = add i32 %i0, %i2
- %sum13 = add i64 %i1, %i3
- %sum02.sext = sext i32 %sum02 to i64
- %b = icmp eq i64 %sum02.sext, %sum13
- ret i1 %b
-}
-
-declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
-declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
-declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
-declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
-declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
-declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
-declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
-declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
-
define double @test_intrinsic_pow() nounwind uwtable ssp {
entry:
; CHECK-LABEL: @test_intrinsic_pow(
diff --git a/test/Transforms/ConstProp/sse.ll b/test/Transforms/ConstProp/sse.ll
new file mode 100644
index 000000000000..cc37c96c1ff1
--- /dev/null
+++ b/test/Transforms/ConstProp/sse.ll
@@ -0,0 +1,208 @@
+; RUN: opt < %s -constprop -S | FileCheck %s
+; REQUIRES: x86
+
+define i1 @test_sse_cvts_exact() nounwind readnone {
+; CHECK-LABEL: @test_sse_cvts_exact(
+; CHECK-NOT: call
+; CHECK: ret i1 true
+entry:
+ %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind
+ %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind
+ %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 7.0, double undef>) nounwind
+ %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 7.0, double undef>) nounwind
+ %sum02 = add i32 %i0, %i2
+ %sum13 = add i64 %i1, %i3
+ %cmp02 = icmp eq i32 %sum02, 10
+ %cmp13 = icmp eq i64 %sum13, 10
+ %b = and i1 %cmp02, %cmp13
+ ret i1 %b
+}
+
+; Inexact values should not fold, as they depend on the rounding mode
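+; (cvtss2si and friends round according to the run-time MXCSR rounding mode,
+; which the compiler cannot observe, so only exactly representable inputs
+; are folded.)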
+define i1 @test_sse_cvts_inexact() nounwind readnone {
+; CHECK-LABEL: @test_sse_cvts_inexact(
+; CHECK: call
+; CHECK: call
+; CHECK: call
+; CHECK: call
+entry:
+ %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+ %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+ %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 1.75, double undef>) nounwind
+ %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 1.75, double undef>) nounwind
+ %sum02 = add i32 %i0, %i2
+ %sum13 = add i64 %i1, %i3
+ %cmp02 = icmp eq i32 %sum02, 4
+ %cmp13 = icmp eq i64 %sum13, 4
+ %b = and i1 %cmp02, %cmp13
+ ret i1 %b
+}
+
+; FLT_MAX/DBL_MAX should not fold
+define i1 @test_sse_cvts_max() nounwind readnone {
+; CHECK-LABEL: @test_sse_cvts_max(
+; CHECK: call
+; CHECK: call
+; CHECK: call
+; CHECK: call
+entry:
+ %fm = bitcast <4 x i32> <i32 2139095039, i32 undef, i32 undef, i32 undef> to <4 x float>
+ %dm = bitcast <2 x i64> <i64 9218868437227405311, i64 undef> to <2 x double>
+ %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind
+ %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind
+ %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %dm) nounwind
+ %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind
+ %sum02 = add i32 %i0, %i2
+ %sum13 = add i64 %i1, %i3
+ %sum02.sext = sext i32 %sum02 to i64
+ %b = icmp eq i64 %sum02.sext, %sum13
+ ret i1 %b
+}
+
+; INF should not fold
+define i1 @test_sse_cvts_inf() nounwind readnone {
+; CHECK-LABEL: @test_sse_cvts_inf(
+; CHECK: call
+; CHECK: call
+; CHECK: call
+; CHECK: call
+entry:
+ %fm = bitcast <4 x i32> <i32 2139095040, i32 undef, i32 undef, i32 undef> to <4 x float>
+ %dm = bitcast <2 x i64> <i64 9218868437227405312, i64 undef> to <2 x double>
+ %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind
+ %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind
+ %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %dm) nounwind
+ %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind
+ %sum02 = add i32 %i0, %i2
+ %sum13 = add i64 %i1, %i3
+ %sum02.sext = sext i32 %sum02 to i64
+ %b = icmp eq i64 %sum02.sext, %sum13
+ ret i1 %b
+}
+
+; NAN should not fold
+define i1 @test_sse_cvts_nan() nounwind readnone {
+; CHECK-LABEL: @test_sse_cvts_nan(
+; CHECK: call
+; CHECK: call
+; CHECK: call
+; CHECK: call
+entry:
+ %fm = bitcast <4 x i32> <i32 2143289344, i32 undef, i32 undef, i32 undef> to <4 x float>
+ %dm = bitcast <2 x i64> <i64 9221120237041090560, i64 undef> to <2 x double>
+ %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind
+ %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind
+ %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %dm) nounwind
+ %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind
+ %sum02 = add i32 %i0, %i2
+ %sum13 = add i64 %i1, %i3
+ %sum02.sext = sext i32 %sum02 to i64
+ %b = icmp eq i64 %sum02.sext, %sum13
+ ret i1 %b
+}
+
+define i1 @test_sse_cvtts_exact() nounwind readnone {
+; CHECK-LABEL: @test_sse_cvtts_exact(
+; CHECK-NOT: call
+; CHECK: ret i1 true
+entry:
+ %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind
+ %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind
+ %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 7.0, double undef>) nounwind
+ %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 7.0, double undef>) nounwind
+ %sum02 = add i32 %i0, %i2
+ %sum13 = add i64 %i1, %i3
+ %cmp02 = icmp eq i32 %sum02, 10
+ %cmp13 = icmp eq i64 %sum13, 10
+ %b = and i1 %cmp02, %cmp13
+ ret i1 %b
+}
+
+define i1 @test_sse_cvtts_inexact() nounwind readnone {
+; CHECK-LABEL: @test_sse_cvtts_inexact(
+; CHECK-NOT: call
+; CHECK: ret i1 true
+entry:
+ %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+ %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind
+ %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 1.75, double undef>) nounwind
+ %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 1.75, double undef>) nounwind
+ %sum02 = add i32 %i0, %i2
+ %sum13 = add i64 %i1, %i3
+ %cmp02 = icmp eq i32 %sum02, 2
+ %cmp13 = icmp eq i64 %sum13, 2
+ %b = and i1 %cmp02, %cmp13
+ ret i1 %b
+}
+
+; FLT_MAX/DBL_MAX should not fold
+define i1 @test_sse_cvtts_max() nounwind readnone {
+; CHECK-LABEL: @test_sse_cvtts_max(
+; CHECK: call
+; CHECK: call
+; CHECK: call
+; CHECK: call
+entry:
+ %fm = bitcast <4 x i32> <i32 2139095039, i32 undef, i32 undef, i32 undef> to <4 x float>
+ %dm = bitcast <2 x i64> <i64 9218868437227405311, i64 undef> to <2 x double>
+ %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind
+ %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind
+ %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind
+ %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind
+ %sum02 = add i32 %i0, %i2
+ %sum13 = add i64 %i1, %i3
+ %sum02.sext = sext i32 %sum02 to i64
+ %b = icmp eq i64 %sum02.sext, %sum13
+ ret i1 %b
+}
+
+; INF should not fold
+define i1 @test_sse_cvtts_inf() nounwind readnone {
+; CHECK-LABEL: @test_sse_cvtts_inf(
+; CHECK: call
+; CHECK: call
+; CHECK: call
+; CHECK: call
+entry:
+ %fm = bitcast <4 x i32> <i32 2139095040, i32 undef, i32 undef, i32 undef> to <4 x float>
+ %dm = bitcast <2 x i64> <i64 9218868437227405312, i64 undef> to <2 x double>
+ %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind
+ %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind
+ %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind
+ %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind
+ %sum02 = add i32 %i0, %i2
+ %sum13 = add i64 %i1, %i3
+ %sum02.sext = sext i32 %sum02 to i64
+ %b = icmp eq i64 %sum02.sext, %sum13
+ ret i1 %b
+}
+
+; NAN should not fold
+define i1 @test_sse_cvtts_nan() nounwind readnone {
+; CHECK-LABEL: @test_sse_cvtts_nan(
+; CHECK: call
+; CHECK: call
+; CHECK: call
+; CHECK: call
+entry:
+ %fm = bitcast <4 x i32> <i32 2143289344, i32 undef, i32 undef, i32 undef> to <4 x float>
+ %dm = bitcast <2 x i64> <i64 9221120237041090560, i64 undef> to <2 x double>
+ %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind
+ %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind
+ %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind
+ %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind
+ %sum02 = add i32 %i0, %i2
+ %sum13 = add i64 %i1, %i3
+ %sum02.sext = sext i32 %sum02 to i64
+ %b = icmp eq i64 %sum02.sext, %sum13
+ ret i1 %b
+}
+
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
diff --git a/test/Transforms/Coroutines/coro-eh-aware-edge-split.ll b/test/Transforms/Coroutines/coro-eh-aware-edge-split.ll
new file mode 100644
index 000000000000..5da0e3c199db
--- /dev/null
+++ b/test/Transforms/Coroutines/coro-eh-aware-edge-split.ll
@@ -0,0 +1,218 @@
+; Check that we can handle edge splits leading into a landingpad
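+; When a value spilled to the coroutine frame feeds a phi in the pad, the pad
+; must be duplicated on each incoming edge so any reloads can be materialized
+; per edge before the values merge.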
+; RUN: opt < %s -coro-split -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: define internal fastcc void @f.resume(
+define void @f(i1 %cond) "coroutine.presplit"="1" personality i32 0 {
+entry:
+ %id = call token @llvm.coro.id(i32 16, i8* null, i8* null, i8* null)
+ %size = tail call i64 @llvm.coro.size.i64()
+ %alloc = call i8* @malloc(i64 %size)
+ %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+ %sp = call i8 @llvm.coro.suspend(token none, i1 false)
+ switch i8 %sp, label %coro.ret [
+ i8 0, label %resume
+ i8 1, label %cleanup
+ ]
+
+resume:
+ br i1 %cond, label %invoke1, label %invoke2
+
+invoke1:
+ invoke void @may_throw1()
+ to label %unreach unwind label %pad.with.phi
+invoke2:
+ invoke void @may_throw2()
+ to label %unreach unwind label %pad.with.phi
+
+; Verify that we cloned the landing pad on every incoming edge feeding the phi
+
+; CHECK: pad.with.phi.from.invoke2:
+; CHECK: %0 = landingpad { i8*, i32 }
+; CHECK: catch i8* null
+; CHECK: br label %pad.with.phi
+
+; CHECK: pad.with.phi.from.invoke1:
+; CHECK: %1 = landingpad { i8*, i32 }
+; CHECK: catch i8* null
+; CHECK: br label %pad.with.phi
+
+; CHECK: pad.with.phi:
+; CHECK: %val = phi i32 [ 0, %pad.with.phi.from.invoke1 ], [ 1, %pad.with.phi.from.invoke2 ]
+; CHECK: %lp = phi { i8*, i32 } [ %0, %pad.with.phi.from.invoke2 ], [ %1, %pad.with.phi.from.invoke1 ]
+; CHECK: %exn = extractvalue { i8*, i32 } %lp, 0
+; CHECK: call i8* @__cxa_begin_catch(i8* %exn)
+; CHECK: call void @use_val(i32 %val)
+; CHECK: call void @__cxa_end_catch()
+; CHECK: call void @free(i8* %vFrame)
+; CHECK: ret void
+
+pad.with.phi:
+ %val = phi i32 [ 0, %invoke1 ], [ 1, %invoke2 ]
+ %lp = landingpad { i8*, i32 }
+ catch i8* null
+ %exn = extractvalue { i8*, i32 } %lp, 0
+ call i8* @__cxa_begin_catch(i8* %exn)
+ call void @use_val(i32 %val)
+ call void @__cxa_end_catch()
+ br label %cleanup
+
+cleanup:
+ %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+ call void @free(i8* %mem)
+ br label %coro.ret
+
+coro.ret:
+ call i1 @llvm.coro.end(i8* null, i1 false)
+ ret void
+
+unreach:
+ unreachable
+}
+
+; CHECK-LABEL: define internal fastcc void @g.resume(
+define void @g(i1 %cond, i32 %x, i32 %y) "coroutine.presplit"="1" personality i32 0 {
+entry:
+ %id = call token @llvm.coro.id(i32 16, i8* null, i8* null, i8* null)
+ %size = tail call i64 @llvm.coro.size.i64()
+ %alloc = call i8* @malloc(i64 %size)
+ %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+ %sp = call i8 @llvm.coro.suspend(token none, i1 false)
+ switch i8 %sp, label %coro.ret [
+ i8 0, label %resume
+ i8 1, label %cleanup
+ ]
+
+resume:
+ br i1 %cond, label %invoke1, label %invoke2
+
+invoke1:
+ invoke void @may_throw1()
+ to label %unreach unwind label %pad.with.phi
+invoke2:
+ invoke void @may_throw2()
+ to label %unreach unwind label %pad.with.phi
+
+; Verify that we created cleanuppads on every edge and inserted a reload of the spilled value
+
+; CHECK: pad.with.phi.from.invoke2:
+; CHECK: %0 = cleanuppad within none []
+; CHECK: %y.reload.addr = getelementptr inbounds %g.Frame, %g.Frame* %FramePtr, i32 0, i32 6
+; CHECK: %y.reload = load i32, i32* %y.reload.addr
+; CHECK: cleanupret from %0 unwind label %pad.with.phi
+
+; CHECK: pad.with.phi.from.invoke1:
+; CHECK: %1 = cleanuppad within none []
+; CHECK: %x.reload.addr = getelementptr inbounds %g.Frame, %g.Frame* %FramePtr, i32 0, i32 5
+; CHECK: %x.reload = load i32, i32* %x.reload.addr
+; CHECK: cleanupret from %1 unwind label %pad.with.phi
+
+; CHECK: pad.with.phi:
+; CHECK: %val = phi i32 [ %x.reload, %pad.with.phi.from.invoke1 ], [ %y.reload, %pad.with.phi.from.invoke2 ]
+; CHECK: %tok = cleanuppad within none []
+; CHECK: call void @use_val(i32 %val)
+; CHECK: cleanupret from %tok unwind to caller
+
+pad.with.phi:
+ %val = phi i32 [ %x, %invoke1 ], [ %y, %invoke2 ]
+ %tok = cleanuppad within none []
+ call void @use_val(i32 %val)
+ cleanupret from %tok unwind to caller
+
+cleanup:
+ %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+ call void @free(i8* %mem)
+ br label %coro.ret
+
+coro.ret:
+ call i1 @llvm.coro.end(i8* null, i1 false)
+ ret void
+
+unreach:
+ unreachable
+}
+
+; CHECK-LABEL: define internal fastcc void @h.resume(
+define void @h(i1 %cond, i32 %x, i32 %y) "coroutine.presplit"="1" personality i32 0 {
+entry:
+ %id = call token @llvm.coro.id(i32 16, i8* null, i8* null, i8* null)
+ %size = tail call i64 @llvm.coro.size.i64()
+ %alloc = call i8* @malloc(i64 %size)
+ %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+ %sp = call i8 @llvm.coro.suspend(token none, i1 false)
+ switch i8 %sp, label %coro.ret [
+ i8 0, label %resume
+ i8 1, label %cleanup
+ ]
+
+resume:
+ br i1 %cond, label %invoke1, label %invoke2
+
+invoke1:
+ invoke void @may_throw1()
+ to label %coro.ret unwind label %pad.with.phi
+invoke2:
+ invoke void @may_throw2()
+ to label %coro.ret unwind label %pad.with.phi
+
+; Verify that we created cleanuppads on every edge and inserted a reload of the spilled value
+
+; CHECK: pad.with.phi.from.invoke2:
+; CHECK: %0 = cleanuppad within none []
+; CHECK: %y.reload.addr = getelementptr inbounds %h.Frame, %h.Frame* %FramePtr, i32 0, i32 6
+; CHECK: %y.reload = load i32, i32* %y.reload.addr
+; CHECK: cleanupret from %0 unwind label %pad.with.phi
+
+; CHECK: pad.with.phi.from.invoke1:
+; CHECK: %1 = cleanuppad within none []
+; CHECK: %x.reload.addr = getelementptr inbounds %h.Frame, %h.Frame* %FramePtr, i32 0, i32 5
+; CHECK: %x.reload = load i32, i32* %x.reload.addr
+; CHECK: cleanupret from %1 unwind label %pad.with.phi
+
+; CHECK: pad.with.phi:
+; CHECK: %val = phi i32 [ %x.reload, %pad.with.phi.from.invoke1 ], [ %y.reload, %pad.with.phi.from.invoke2 ]
+; CHECK: %switch = catchswitch within none [label %catch] unwind to caller
+pad.with.phi:
+ %val = phi i32 [ %x, %invoke1 ], [ %y, %invoke2 ]
+ %switch = catchswitch within none [label %catch] unwind to caller
+
+catch: ; preds = %catch.dispatch
+ %pad = catchpad within %switch [i8* null, i32 64, i8* null]
+ call void @use_val(i32 %val)
+ catchret from %pad to label %coro.ret
+
+cleanup:
+ %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+ call void @free(i8* %mem)
+ br label %coro.ret
+
+coro.ret:
+ call i1 @llvm.coro.end(i8* null, i1 false)
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind readonly
+declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*)
+declare noalias i8* @malloc(i64)
+declare i64 @llvm.coro.size.i64()
+declare i8* @llvm.coro.begin(token, i8* writeonly)
+
+; Function Attrs: nounwind
+declare token @llvm.coro.save(i8*)
+declare i8 @llvm.coro.suspend(token, i1)
+
+; Function Attrs: argmemonly nounwind
+declare void @may_throw1()
+declare void @may_throw2()
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @use_val(i32)
+declare void @__cxa_end_catch()
+
+; Function Attrs: nounwind
+declare i1 @llvm.coro.end(i8*, i1)
+declare void @free(i8*)
+declare i8* @llvm.coro.free(token, i8* nocapture readonly)
diff --git a/test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll b/test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll
index 0769575759ba..05dc79db95ad 100644
--- a/test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll
+++ b/test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll
@@ -5,8 +5,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-macosx10.7.0"
-
-define i1 @rb_intern() nounwind ssp {
+define i1 @rb_intern(i8* %foo) nounwind ssp {
; CHECK-LABEL: @rb_intern(
bb:
@@ -19,7 +18,7 @@ bb1:
br i1 undef, label %bb3, label %bb15
; CHECK: bb1:
-; CHECK: [[TMP:%.*]] = phi i8* [ getelementptr (i8, i8* null, i64 undef), %bb10 ], [ null, %bb ]
+; CHECK: [[TMP:%.*]] = phi i8* [ %tmp14, %bb10 ], [ null, %bb ]
; CHECK: bb1.bb15_crit_edge:
; CHECK: %tmp17.pre = load i8, i8* [[TMP]], align 1
@@ -41,7 +40,7 @@ bb10:
%tmp11 = load i8*, i8** %tmp, align 8
%tmp12 = load i8, i8* %tmp11, align 1
%tmp13 = zext i8 %tmp12 to i64
- %tmp14 = getelementptr inbounds i8, i8* null, i64 undef
+ %tmp14 = getelementptr inbounds i8, i8* %foo, i64 undef
store i8* %tmp14, i8** %tmp, align 8
br label %bb1
diff --git a/test/Transforms/GVN/PRE/nonintegral.ll b/test/Transforms/GVN/PRE/nonintegral.ll
new file mode 100644
index 000000000000..75a756e8af8c
--- /dev/null
+++ b/test/Transforms/GVN/PRE/nonintegral.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -gvn -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:4"
+target triple = "x86_64-unknown-linux-gnu"
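+
+; The "ni:4" suffix in the datalayout marks address space 4 as non-integral,
+; so GVN must not rewrite these pointers via ptrtoint/inttoptr; the PRE'd
+; load is instead reused through a bitcast of the pointer operand.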
+
+define void @nipre(double addrspace(4)** noalias %p, i64 addrspace(4)** noalias %p2, i8 %jmp) {
+
+; CHECK-LABEL: @nipre(
+; CHECK: [[PCAST:%.*]] = bitcast double addrspace(4)** [[P:%.*]] to i64 addrspace(4)**
+; CHECK: a:
+; CHECK: [[L1:%.*]] = load i64 addrspace(4)*, i64 addrspace(4)** [[PCAST]]
+; CHECK: [[TMP0:%.*]] = bitcast i64 addrspace(4)* [[L1]] to double addrspace(4)*
+; CHECK: b:
+; CHECK: [[L2:%.*]] = load i64 addrspace(4)*, i64 addrspace(4)** [[PCAST]]
+; CHECK: [[TMP1:%.*]] = bitcast i64 addrspace(4)* [[L2]] to double addrspace(4)*
+; CHECK: c:
+; CHECK-NEXT: [[L3_PRE:%.*]] = load double addrspace(4)*, double addrspace(4)** %p
+
+entry:
+ %pcast = bitcast double addrspace(4)** %p to i64 addrspace(4)**
+ switch i8 %jmp, label %c [ i8 0, label %a
+ i8 1, label %b]
+a:
+ %l1 = load i64 addrspace(4)*, i64 addrspace(4)** %pcast
+ store i64 addrspace(4)* %l1, i64 addrspace(4)** %p2
+ br label %tail
+b:
+ %l2 = load i64 addrspace(4)*, i64 addrspace(4)** %pcast
+ store i64 addrspace(4)* %l2, i64 addrspace(4)** %p2
+ br label %tail
+c:
+ br label %tail
+tail:
+ %l3 = load double addrspace(4)*, double addrspace(4)** %p
+ %l3cast = bitcast double addrspace(4)* %l3 to i64 addrspace(4)*
+ store i64 addrspace(4)* %l3cast, i64 addrspace(4)** %p2
+ ret void
+}
diff --git a/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll b/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll
index 3d77a364f96f..49e5d24296c0 100644
--- a/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll
+++ b/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll
@@ -6,7 +6,7 @@ target triple = "thumbv7-apple-darwin"
; CHECK-LABEL: @test(
; CHECK: if.end.i126:
-; CHECK: %exitcond = icmp ne i8* %incdec.ptr.i, getelementptr (i8, i8* null, i32 undef)
+; CHECK: %exitcond = icmp ne i8* %incdec.ptr.i, null
define void @test() nounwind {
entry:
br label %while.cond
diff --git a/test/Transforms/InferFunctionAttrs/annotate.ll b/test/Transforms/InferFunctionAttrs/annotate.ll
index 64676bf310bd..cb4b5cdd1e8c 100644
--- a/test/Transforms/InferFunctionAttrs/annotate.ll
+++ b/test/Transforms/InferFunctionAttrs/annotate.ll
@@ -22,12 +22,138 @@ declare i32 @__nvvm_reflect(i8*)
; Use an opaque pointer type for all the (possibly opaque) structs.
%opaque = type opaque
+; CHECK: declare double @__acos_finite(double)
+declare double @__acos_finite(double)
+
+; CHECK: declare float @__acosf_finite(float)
+declare float @__acosf_finite(float)
+
+; CHECK: declare double @__acosh_finite(double)
+declare double @__acosh_finite(double)
+
+; CHECK: declare float @__acoshf_finite(float)
+declare float @__acoshf_finite(float)
+
+; CHECK: declare x86_fp80 @__acoshl_finite(x86_fp80)
+declare x86_fp80 @__acoshl_finite(x86_fp80)
+
+; CHECK: declare x86_fp80 @__acosl_finite(x86_fp80)
+declare x86_fp80 @__acosl_finite(x86_fp80)
+
+; CHECK: declare double @__asin_finite(double)
+declare double @__asin_finite(double)
+
+; CHECK: declare float @__asinf_finite(float)
+declare float @__asinf_finite(float)
+
+; CHECK: declare x86_fp80 @__asinl_finite(x86_fp80)
+declare x86_fp80 @__asinl_finite(x86_fp80)
+
+; CHECK: declare double @__atan2_finite(double, double)
+declare double @__atan2_finite(double, double)
+
+; CHECK: declare float @__atan2f_finite(float, float)
+declare float @__atan2f_finite(float, float)
+
+; CHECK: declare x86_fp80 @__atan2l_finite(x86_fp80, x86_fp80)
+declare x86_fp80 @__atan2l_finite(x86_fp80, x86_fp80)
+
+; CHECK: declare double @__atanh_finite(double)
+declare double @__atanh_finite(double)
+
+; CHECK: declare float @__atanhf_finite(float)
+declare float @__atanhf_finite(float)
+
+; CHECK: declare x86_fp80 @__atanhl_finite(x86_fp80)
+declare x86_fp80 @__atanhl_finite(x86_fp80)
+
+; CHECK: declare double @__cosh_finite(double)
+declare double @__cosh_finite(double)
+
+; CHECK: declare float @__coshf_finite(float)
+declare float @__coshf_finite(float)
+
+; CHECK: declare x86_fp80 @__coshl_finite(x86_fp80)
+declare x86_fp80 @__coshl_finite(x86_fp80)
+
; CHECK: declare double @__cospi(double)
declare double @__cospi(double)
; CHECK: declare float @__cospif(float)
declare float @__cospif(float)
+; CHECK: declare double @__exp10_finite(double)
+declare double @__exp10_finite(double)
+
+; CHECK: declare float @__exp10f_finite(float)
+declare float @__exp10f_finite(float)
+
+; CHECK: declare x86_fp80 @__exp10l_finite(x86_fp80)
+declare x86_fp80 @__exp10l_finite(x86_fp80)
+
+; CHECK: declare double @__exp2_finite(double)
+declare double @__exp2_finite(double)
+
+; CHECK: declare float @__exp2f_finite(float)
+declare float @__exp2f_finite(float)
+
+; CHECK: declare x86_fp80 @__exp2l_finite(x86_fp80)
+declare x86_fp80 @__exp2l_finite(x86_fp80)
+
+; CHECK: declare double @__exp_finite(double)
+declare double @__exp_finite(double)
+
+; CHECK: declare float @__expf_finite(float)
+declare float @__expf_finite(float)
+
+; CHECK: declare x86_fp80 @__expl_finite(x86_fp80)
+declare x86_fp80 @__expl_finite(x86_fp80)
+
+; CHECK: declare double @__log10_finite(double)
+declare double @__log10_finite(double)
+
+; CHECK: declare float @__log10f_finite(float)
+declare float @__log10f_finite(float)
+
+; CHECK: declare x86_fp80 @__log10l_finite(x86_fp80)
+declare x86_fp80 @__log10l_finite(x86_fp80)
+
+; CHECK: declare double @__log2_finite(double)
+declare double @__log2_finite(double)
+
+; CHECK: declare float @__log2f_finite(float)
+declare float @__log2f_finite(float)
+
+; CHECK: declare x86_fp80 @__log2l_finite(x86_fp80)
+declare x86_fp80 @__log2l_finite(x86_fp80)
+
+; CHECK: declare double @__log_finite(double)
+declare double @__log_finite(double)
+
+; CHECK: declare float @__logf_finite(float)
+declare float @__logf_finite(float)
+
+; CHECK: declare x86_fp80 @__logl_finite(x86_fp80)
+declare x86_fp80 @__logl_finite(x86_fp80)
+
+; CHECK: declare double @__pow_finite(double, double)
+declare double @__pow_finite(double, double)
+
+; CHECK: declare float @__powf_finite(float, float)
+declare float @__powf_finite(float, float)
+
+; CHECK: declare x86_fp80 @__powl_finite(x86_fp80, x86_fp80)
+declare x86_fp80 @__powl_finite(x86_fp80, x86_fp80)
+
+; CHECK: declare double @__sinh_finite(double)
+declare double @__sinh_finite(double)
+
+; CHECK: declare float @__sinhf_finite(float)
+declare float @__sinhf_finite(float)
+
+; CHECK: declare x86_fp80 @__sinhl_finite(x86_fp80)
+declare x86_fp80 @__sinhl_finite(x86_fp80)
+
; CHECK: declare double @__sinpi(double)
declare double @__sinpi(double)
diff --git a/test/Transforms/InferFunctionAttrs/no-proto.ll b/test/Transforms/InferFunctionAttrs/no-proto.ll
index 25a4805c367f..3cab0ab4bf40 100644
--- a/test/Transforms/InferFunctionAttrs/no-proto.ll
+++ b/test/Transforms/InferFunctionAttrs/no-proto.ll
@@ -3,12 +3,138 @@
; Check that we don't modify libc functions with invalid prototypes.
+; CHECK: declare void @__acos_finite(...)
+declare void @__acos_finite(...)
+
+; CHECK: declare void @__acosf_finite(...)
+declare void @__acosf_finite(...)
+
+; CHECK: declare void @__acosh_finite(...)
+declare void @__acosh_finite(...)
+
+; CHECK: declare void @__acoshf_finite(...)
+declare void @__acoshf_finite(...)
+
+; CHECK: declare void @__acoshl_finite(...)
+declare void @__acoshl_finite(...)
+
+; CHECK: declare void @__acosl_finite(...)
+declare void @__acosl_finite(...)
+
+; CHECK: declare void @__asin_finite(...)
+declare void @__asin_finite(...)
+
+; CHECK: declare void @__asinf_finite(...)
+declare void @__asinf_finite(...)
+
+; CHECK: declare void @__asinl_finite(...)
+declare void @__asinl_finite(...)
+
+; CHECK: declare void @__atan2_finite(...)
+declare void @__atan2_finite(...)
+
+; CHECK: declare void @__atan2f_finite(...)
+declare void @__atan2f_finite(...)
+
+; CHECK: declare void @__atan2l_finite(...)
+declare void @__atan2l_finite(...)
+
+; CHECK: declare void @__atanh_finite(...)
+declare void @__atanh_finite(...)
+
+; CHECK: declare void @__atanhf_finite(...)
+declare void @__atanhf_finite(...)
+
+; CHECK: declare void @__atanhl_finite(...)
+declare void @__atanhl_finite(...)
+
+; CHECK: declare void @__cosh_finite(...)
+declare void @__cosh_finite(...)
+
+; CHECK: declare void @__coshf_finite(...)
+declare void @__coshf_finite(...)
+
+; CHECK: declare void @__coshl_finite(...)
+declare void @__coshl_finite(...)
+
; CHECK: declare void @__cospi(...)
declare void @__cospi(...)
; CHECK: declare void @__cospif(...)
declare void @__cospif(...)
+; CHECK: declare void @__exp10_finite(...)
+declare void @__exp10_finite(...)
+
+; CHECK: declare void @__exp10f_finite(...)
+declare void @__exp10f_finite(...)
+
+; CHECK: declare void @__exp10l_finite(...)
+declare void @__exp10l_finite(...)
+
+; CHECK: declare void @__exp2_finite(...)
+declare void @__exp2_finite(...)
+
+; CHECK: declare void @__exp2f_finite(...)
+declare void @__exp2f_finite(...)
+
+; CHECK: declare void @__exp2l_finite(...)
+declare void @__exp2l_finite(...)
+
+; CHECK: declare void @__exp_finite(...)
+declare void @__exp_finite(...)
+
+; CHECK: declare void @__expf_finite(...)
+declare void @__expf_finite(...)
+
+; CHECK: declare void @__expl_finite(...)
+declare void @__expl_finite(...)
+
+; CHECK: declare void @__log10_finite(...)
+declare void @__log10_finite(...)
+
+; CHECK: declare void @__log10f_finite(...)
+declare void @__log10f_finite(...)
+
+; CHECK: declare void @__log10l_finite(...)
+declare void @__log10l_finite(...)
+
+; CHECK: declare void @__log2_finite(...)
+declare void @__log2_finite(...)
+
+; CHECK: declare void @__log2f_finite(...)
+declare void @__log2f_finite(...)
+
+; CHECK: declare void @__log2l_finite(...)
+declare void @__log2l_finite(...)
+
+; CHECK: declare void @__log_finite(...)
+declare void @__log_finite(...)
+
+; CHECK: declare void @__logf_finite(...)
+declare void @__logf_finite(...)
+
+; CHECK: declare void @__logl_finite(...)
+declare void @__logl_finite(...)
+
+; CHECK: declare void @__pow_finite(...)
+declare void @__pow_finite(...)
+
+; CHECK: declare void @__powf_finite(...)
+declare void @__powf_finite(...)
+
+; CHECK: declare void @__powl_finite(...)
+declare void @__powl_finite(...)
+
+; CHECK: declare void @__sinh_finite(...)
+declare void @__sinh_finite(...)
+
+; CHECK: declare void @__sinhf_finite(...)
+declare void @__sinhf_finite(...)
+
+; CHECK: declare void @__sinhl_finite(...)
+declare void @__sinhl_finite(...)
+
; CHECK: declare void @__sinpi(...)
declare void @__sinpi(...)
diff --git a/test/Transforms/Inline/inline-cold.ll b/test/Transforms/Inline/inline-cold.ll
index 93d2569d87ad..e0e679ad4036 100644
--- a/test/Transforms/Inline/inline-cold.ll
+++ b/test/Transforms/Inline/inline-cold.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -inline -S -inlinecold-threshold=75 | FileCheck %s
+; RUN: opt < %s -inline -S -inlinecold-threshold=25 | FileCheck %s
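+; (The cold threshold was lowered and the callee body shrunk accordingly, so
+; the cold function's inline cost presumably still exceeds the threshold.)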
; Test that functions with attribute Cold are not inlined while the
; same function without attribute Cold will be inlined.
@@ -64,23 +64,7 @@ entry:
%x3 = add i32 %x2, %a3
%a4 = load volatile i32, i32* @a
%x4 = add i32 %x3, %a4
- %a5 = load volatile i32, i32* @a
- %x5 = add i32 %x4, %a5
- %a6 = load volatile i32, i32* @a
- %x6 = add i32 %x5, %a6
- %a7 = load volatile i32, i32* @a
- %x7 = add i32 %x6, %a6
- %a8 = load volatile i32, i32* @a
- %x8 = add i32 %x7, %a8
- %a9 = load volatile i32, i32* @a
- %x9 = add i32 %x8, %a9
- %a10 = load volatile i32, i32* @a
- %x10 = add i32 %x9, %a10
- %a11 = load volatile i32, i32* @a
- %x11 = add i32 %x10, %a11
- %a12 = load volatile i32, i32* @a
- %x12 = add i32 %x11, %a12
- %add = add i32 %x12, %a
+ %add = add i32 %x4, %a
ret i32 %add
}
diff --git a/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll b/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll
index 1f2b143c97ee..b8d41abe1c35 100644
--- a/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll
+++ b/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll
@@ -6,7 +6,7 @@ target datalayout = "e-p3:32:32-p4:64:64-n32"
@lds = internal addrspace(3) global [64 x i64] zeroinitializer
; CHECK-LABEL: @constexpr_addrspacecast_ptr_size_change(
-; CHECK: load i64, i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* addrspacecast (i64 addrspace(3)* getelementptr inbounds ([64 x i64], [64 x i64] addrspace(3)* @lds, i32 0, i32 0) to i64 addrspace(4)*), i64 undef)
+; CHECK: load i64, i64 addrspace(4)* addrspacecast (i64 addrspace(3)* getelementptr inbounds ([64 x i64], [64 x i64] addrspace(3)* @lds, i32 0, i32 0) to i64 addrspace(4)*)
; CHECK-NEXT: br
define void @constexpr_addrspacecast_ptr_size_change() #0 {
%tmp0 = call i32 @foo(i64 addrspace(4)* addrspacecast (i64 addrspace(3)* getelementptr inbounds ([64 x i64], [64 x i64] addrspace(3)* @lds, i32 0, i32 0) to i64 addrspace(4)*)) #1
diff --git a/test/Transforms/Inline/partial-inline-act.ll b/test/Transforms/Inline/partial-inline-act.ll
index 916436260bd6..27e719153875 100644
--- a/test/Transforms/Inline/partial-inline-act.ll
+++ b/test/Transforms/Inline/partial-inline-act.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -partial-inliner -disable-output
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -disable-output
; This testcase tests the assumption cache
define internal i32 @inlinedFunc(i1 %cond, i32* align 4 %align.val) {
diff --git a/test/Transforms/Inline/prof-update.ll b/test/Transforms/Inline/prof-update.ll
index 3fefa1c56cea..4a4471e8e17a 100644
--- a/test/Transforms/Inline/prof-update.ll
+++ b/test/Transforms/Inline/prof-update.ll
@@ -6,21 +6,21 @@ declare void @ext1();
@func = global void ()* null
; CHECK: define void @callee(i32 %n) !prof ![[ENTRY_COUNT:[0-9]*]]
-define void @callee(i32 %n) !prof !1 {
+define void @callee(i32 %n) !prof !15 {
%cond = icmp sle i32 %n, 10
br i1 %cond, label %cond_true, label %cond_false
cond_true:
; ext1 is optimized away, thus not updated.
; CHECK: call void @ext1(), !prof ![[COUNT_CALLEE1:[0-9]*]]
- call void @ext1(), !prof !2
+ call void @ext1(), !prof !16
ret void
cond_false:
; ext is cloned and updated.
; CHECK: call void @ext(), !prof ![[COUNT_CALLEE:[0-9]*]]
- call void @ext(), !prof !2
+ call void @ext(), !prof !16
%f = load void ()*, void ()** @func
; CHECK: call void %f(), !prof ![[COUNT_IND_CALLEE:[0-9]*]]
- call void %f(), !prof !4
+ call void %f(), !prof !18
ret void
}
@@ -28,16 +28,29 @@ cond_false:
define void @caller() {
; CHECK: call void @ext(), !prof ![[COUNT_CALLER:[0-9]*]]
; CHECK: call void %f.i(), !prof ![[COUNT_IND_CALLER:[0-9]*]]
- call void @callee(i32 15), !prof !3
+ call void @callee(i32 15), !prof !17
ret void
}
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"MaxFunctionCount", i32 2000}
-!1 = !{!"function_entry_count", i64 1000}
-!2 = !{!"branch_weights", i64 2000}
-!3 = !{!"branch_weights", i64 400}
-!4 = !{!"VP", i32 0, i64 140, i64 111, i64 80, i64 222, i64 40, i64 333, i64 20}
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"SampleProfile"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 10}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 2000}
+!8 = !{!"NumCounts", i64 2}
+!9 = !{!"NumFunctions", i64 2}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 999000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
+!15 = !{!"function_entry_count", i64 1000}
+!16 = !{!"branch_weights", i64 2000}
+!17 = !{!"branch_weights", i64 400}
+!18 = !{!"VP", i32 0, i64 140, i64 111, i64 80, i64 222, i64 40, i64 333, i64 20}
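+; "VP" metadata here reads: value kind 0 (indirect call targets), total count
+; 140, then (callee hash, count) pairs: 111:80, 222:40, 333:20.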
attributes #0 = { alwaysinline }
; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600}
; CHECK: ![[COUNT_CALLEE1]] = !{!"branch_weights", i64 2000}
diff --git a/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll b/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll
index 39408a2d394c..04fb7d91193a 100644
--- a/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll
+++ b/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll
@@ -1,70 +1,6 @@
; RUN: opt -S -instcombine < %s | FileCheck %s
-
-define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp {
-entry:
- %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind
- ret <4 x i32> %a
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> zeroinitializer
-}
-
-define <4 x i32> @mulByOne(<4 x i16> %x) nounwind readnone ssp {
-entry:
- %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
- ret <4 x i32> %a
-; CHECK: entry:
-; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> %a
-}
-
-define <4 x i32> @constantMul() nounwind readnone ssp {
-entry:
- %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
- ret <4 x i32> %a
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-}
-
-define <4 x i32> @constantMulS() nounwind readnone ssp {
-entry:
- %b = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
- ret <4 x i32> %b
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
-}
-
-define <4 x i32> @constantMulU() nounwind readnone ssp {
-entry:
- %b = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
- ret <4 x i32> %b
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
-}
-
-define <4 x i32> @complex1(<4 x i16> %x) nounwind readnone ssp {
-entry:
- %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind
- %b = add <4 x i32> zeroinitializer, %a
- ret <4 x i32> %b
-; CHECK: entry:
-; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]]
-; CHECK-NEXT: ret <4 x i32> %a
-}
-
-define <4 x i32> @complex2(<4 x i32> %x) nounwind readnone ssp {
-entry:
- %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
- %b = add <4 x i32> %x, %a
- ret <4 x i32> %b
-; CHECK: entry:
-; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6>
-; CHECK-NEXT: ret <4 x i32> %b
-}
-
-declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-
-; ARM64 variants - <rdar://problem/12349617>
+; ARM64 neon intrinsic variants - <rdar://problem/12349617>
+; REQUIRES: aarch64
define <4 x i32> @mulByZeroARM64(<4 x i16> %x) nounwind readnone ssp {
entry:
diff --git a/test/Transforms/InstCombine/AArch64/lit.local.cfg b/test/Transforms/InstCombine/AArch64/lit.local.cfg
new file mode 100644
index 000000000000..7184443994b6
--- /dev/null
+++ b/test/Transforms/InstCombine/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'AArch64' not in config.root.targets:
+ config.unsupported = True
diff --git a/test/Transforms/InstCombine/amdgcn-intrinsics.ll b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 1901997c5521..1901997c5521 100644
--- a/test/Transforms/InstCombine/amdgcn-intrinsics.ll
+++ b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
diff --git a/test/Transforms/InstCombine/AMDGPU/lit.local.cfg b/test/Transforms/InstCombine/AMDGPU/lit.local.cfg
new file mode 100644
index 000000000000..2a665f06be72
--- /dev/null
+++ b/test/Transforms/InstCombine/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'AMDGPU' not in config.root.targets:
+ config.unsupported = True
diff --git a/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll b/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
new file mode 100644
index 000000000000..9efed367d19f
--- /dev/null
+++ b/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
@@ -0,0 +1,65 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp {
+entry:
+ %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind
+ ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> zeroinitializer
+}
+
+define <4 x i32> @mulByOne(<4 x i16> %x) nounwind readnone ssp {
+entry:
+ %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+ ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> %a
+}
+
+define <4 x i32> @constantMul() nounwind readnone ssp {
+entry:
+ %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+ ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+}
+
+define <4 x i32> @constantMulS() nounwind readnone ssp {
+entry:
+ %b = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+ ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+}
+
+define <4 x i32> @constantMulU() nounwind readnone ssp {
+entry:
+ %b = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+ ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+}
+
+define <4 x i32> @complex1(<4 x i16> %x) nounwind readnone ssp {
+entry:
+ %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind
+ %b = add <4 x i32> zeroinitializer, %a
+ ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]]
+; CHECK-NEXT: ret <4 x i32> %a
+}
+
+define <4 x i32> @complex2(<4 x i32> %x) nounwind readnone ssp {
+entry:
+ %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+ %b = add <4 x i32> %x, %a
+ ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT: ret <4 x i32> %b
+}
+
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
diff --git a/test/Transforms/InstCombine/constant-fold-hang.ll b/test/Transforms/InstCombine/ARM/constant-fold-hang.ll
index 2ca6b86ccc2f..2ca6b86ccc2f 100644
--- a/test/Transforms/InstCombine/constant-fold-hang.ll
+++ b/test/Transforms/InstCombine/ARM/constant-fold-hang.ll
diff --git a/test/Transforms/InstCombine/ARM/lit.local.cfg b/test/Transforms/InstCombine/ARM/lit.local.cfg
new file mode 100644
index 000000000000..236e1d344166
--- /dev/null
+++ b/test/Transforms/InstCombine/ARM/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'ARM' not in config.root.targets:
+ config.unsupported = True
diff --git a/test/Transforms/InstCombine/neon-intrinsics.ll b/test/Transforms/InstCombine/ARM/neon-intrinsics.ll
index d22fa9c811dc..d22fa9c811dc 100644
--- a/test/Transforms/InstCombine/neon-intrinsics.ll
+++ b/test/Transforms/InstCombine/ARM/neon-intrinsics.ll
diff --git a/test/Transforms/InstCombine/aligned-altivec.ll b/test/Transforms/InstCombine/PowerPC/aligned-altivec.ll
index 10b4e4d62631..10b4e4d62631 100644
--- a/test/Transforms/InstCombine/aligned-altivec.ll
+++ b/test/Transforms/InstCombine/PowerPC/aligned-altivec.ll
diff --git a/test/Transforms/InstCombine/aligned-qpx.ll b/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll
index e9710df5670c..e9710df5670c 100644
--- a/test/Transforms/InstCombine/aligned-qpx.ll
+++ b/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll
diff --git a/test/Transforms/InstCombine/PowerPC/lit.local.cfg b/test/Transforms/InstCombine/PowerPC/lit.local.cfg
new file mode 100644
index 000000000000..5d33887ff0a4
--- /dev/null
+++ b/test/Transforms/InstCombine/PowerPC/lit.local.cfg
@@ -0,0 +1,3 @@
+if 'PowerPC' not in config.root.targets:
+ config.unsupported = True
+
diff --git a/test/Transforms/InstCombine/vsx-unaligned.ll b/test/Transforms/InstCombine/PowerPC/vsx-unaligned.ll
index ad264fb15b31..ad264fb15b31 100644
--- a/test/Transforms/InstCombine/vsx-unaligned.ll
+++ b/test/Transforms/InstCombine/PowerPC/vsx-unaligned.ll
diff --git a/test/Transforms/InstCombine/X86FsubCmpCombine.ll b/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll
index fde0692d00a2..fde0692d00a2 100644
--- a/test/Transforms/InstCombine/X86FsubCmpCombine.ll
+++ b/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll
diff --git a/test/Transforms/InstCombine/blend_x86.ll b/test/Transforms/InstCombine/X86/blend_x86.ll
index 39ceb0186efe..39ceb0186efe 100644
--- a/test/Transforms/InstCombine/blend_x86.ll
+++ b/test/Transforms/InstCombine/X86/blend_x86.ll
diff --git a/test/Transforms/InstCombine/X86/lit.local.cfg b/test/Transforms/InstCombine/X86/lit.local.cfg
new file mode 100644
index 000000000000..c8625f4d9d24
--- /dev/null
+++ b/test/Transforms/InstCombine/X86/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'X86' not in config.root.targets:
+ config.unsupported = True
diff --git a/test/Transforms/InstCombine/pr2645-1.ll b/test/Transforms/InstCombine/X86/pr2645-1.ll
index 2986d21866bf..2986d21866bf 100644
--- a/test/Transforms/InstCombine/pr2645-1.ll
+++ b/test/Transforms/InstCombine/X86/pr2645-1.ll
diff --git a/test/Transforms/InstCombine/shufflemask-undef.ll b/test/Transforms/InstCombine/X86/shufflemask-undef.ll
index 10509a92941b..d95c42da5f7e 100644
--- a/test/Transforms/InstCombine/shufflemask-undef.ll
+++ b/test/Transforms/InstCombine/X86/shufflemask-undef.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -instcombine -S | not grep "shufflevector.*i32 8"
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK-NOT: shufflevector{{.*}}i32 8
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9"
diff --git a/test/Transforms/InstCombine/x86-avx2.ll b/test/Transforms/InstCombine/X86/x86-avx2.ll
index f4045f788e2d..f4045f788e2d 100644
--- a/test/Transforms/InstCombine/x86-avx2.ll
+++ b/test/Transforms/InstCombine/X86/x86-avx2.ll
diff --git a/test/Transforms/InstCombine/x86-avx512.ll b/test/Transforms/InstCombine/X86/x86-avx512.ll
index 2a24d93ce76a..2a24d93ce76a 100644
--- a/test/Transforms/InstCombine/x86-avx512.ll
+++ b/test/Transforms/InstCombine/X86/x86-avx512.ll
diff --git a/test/Transforms/InstCombine/x86-crc32-demanded.ll b/test/Transforms/InstCombine/X86/x86-crc32-demanded.ll
index 878b97d1bb22..878b97d1bb22 100644
--- a/test/Transforms/InstCombine/x86-crc32-demanded.ll
+++ b/test/Transforms/InstCombine/X86/x86-crc32-demanded.ll
diff --git a/test/Transforms/InstCombine/x86-f16c.ll b/test/Transforms/InstCombine/X86/x86-f16c.ll
index 6b5b6cb26eda..6b5b6cb26eda 100644
--- a/test/Transforms/InstCombine/x86-f16c.ll
+++ b/test/Transforms/InstCombine/X86/x86-f16c.ll
diff --git a/test/Transforms/InstCombine/x86-fma.ll b/test/Transforms/InstCombine/X86/x86-fma.ll
index 0d27d3276163..0d27d3276163 100644
--- a/test/Transforms/InstCombine/x86-fma.ll
+++ b/test/Transforms/InstCombine/X86/x86-fma.ll
diff --git a/test/Transforms/InstCombine/x86-insertps.ll b/test/Transforms/InstCombine/X86/x86-insertps.ll
index f55ea6f22d2e..f55ea6f22d2e 100644
--- a/test/Transforms/InstCombine/x86-insertps.ll
+++ b/test/Transforms/InstCombine/X86/x86-insertps.ll
diff --git a/test/Transforms/InstCombine/x86-masked-memops.ll b/test/Transforms/InstCombine/X86/x86-masked-memops.ll
index 8502b1899ecb..8502b1899ecb 100644
--- a/test/Transforms/InstCombine/x86-masked-memops.ll
+++ b/test/Transforms/InstCombine/X86/x86-masked-memops.ll
diff --git a/test/Transforms/InstCombine/x86-movmsk.ll b/test/Transforms/InstCombine/X86/x86-movmsk.ll
index 11acc1dbca84..11acc1dbca84 100644
--- a/test/Transforms/InstCombine/x86-movmsk.ll
+++ b/test/Transforms/InstCombine/X86/x86-movmsk.ll
diff --git a/test/Transforms/InstCombine/x86-muldq.ll b/test/Transforms/InstCombine/X86/x86-muldq.ll
index bcbb8919c403..bcbb8919c403 100644
--- a/test/Transforms/InstCombine/x86-muldq.ll
+++ b/test/Transforms/InstCombine/X86/x86-muldq.ll
diff --git a/test/Transforms/InstCombine/x86-pack.ll b/test/Transforms/InstCombine/X86/x86-pack.ll
index f3c41a8aa476..f3c41a8aa476 100644
--- a/test/Transforms/InstCombine/x86-pack.ll
+++ b/test/Transforms/InstCombine/X86/x86-pack.ll
diff --git a/test/Transforms/InstCombine/x86-pshufb.ll b/test/Transforms/InstCombine/X86/x86-pshufb.ll
index f181ef57fe20..f181ef57fe20 100644
--- a/test/Transforms/InstCombine/x86-pshufb.ll
+++ b/test/Transforms/InstCombine/X86/x86-pshufb.ll
diff --git a/test/Transforms/InstCombine/x86-sse.ll b/test/Transforms/InstCombine/X86/x86-sse.ll
index 6ed62a4e0224..6ed62a4e0224 100644
--- a/test/Transforms/InstCombine/x86-sse.ll
+++ b/test/Transforms/InstCombine/X86/x86-sse.ll
diff --git a/test/Transforms/InstCombine/x86-sse2.ll b/test/Transforms/InstCombine/X86/x86-sse2.ll
index fe8828bfb5b2..fe8828bfb5b2 100644
--- a/test/Transforms/InstCombine/x86-sse2.ll
+++ b/test/Transforms/InstCombine/X86/x86-sse2.ll
diff --git a/test/Transforms/InstCombine/x86-sse41.ll b/test/Transforms/InstCombine/X86/x86-sse41.ll
index 16975471b9e1..16975471b9e1 100644
--- a/test/Transforms/InstCombine/x86-sse41.ll
+++ b/test/Transforms/InstCombine/X86/x86-sse41.ll
diff --git a/test/Transforms/InstCombine/x86-sse4a.ll b/test/Transforms/InstCombine/X86/x86-sse4a.ll
index e36a73532259..e36a73532259 100644
--- a/test/Transforms/InstCombine/x86-sse4a.ll
+++ b/test/Transforms/InstCombine/X86/x86-sse4a.ll
diff --git a/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll b/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll
new file mode 100644
index 000000000000..5ad8e767d767
--- /dev/null
+++ b/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll
@@ -0,0 +1,110 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i16 @test1(float %f) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: [[TMP281:%.*]] = fadd float %f, -1.000000e+00
+; CHECK-NEXT: [[TMP373:%.*]] = fmul float [[TMP281]], 5.000000e-01
+; CHECK-NEXT: [[TMP374:%.*]] = insertelement <4 x float> undef, float [[TMP373]], i32 0
+; CHECK-NEXT: [[TMP48:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP374]], <4 x float> <float 6.553500e+04, float undef, float undef, float undef>)
+; CHECK-NEXT: [[TMP59:%.*]] = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[TMP48]], <4 x float> <float 0.000000e+00, float undef, float undef, float undef>)
+; CHECK-NEXT: [[TMP_UPGRD_1:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[TMP59]])
+; CHECK-NEXT: [[TMP69:%.*]] = trunc i32 [[TMP_UPGRD_1]] to i16
+; CHECK-NEXT: ret i16 [[TMP69]]
+;
+ %tmp = insertelement <4 x float> undef, float %f, i32 0
+ %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1
+ %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
+ %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
+ %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )
+ %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )
+ %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )
+ %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer )
+ %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )
+ %tmp69 = trunc i32 %tmp.upgrd.1 to i16
+ ret i16 %tmp69
+}
+
+define i64 @test3(float %f, double %d) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT: [[V00:%.*]] = insertelement <4 x float> undef, float %f, i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> [[V00]])
+; CHECK-NEXT: [[V10:%.*]] = insertelement <4 x float> undef, float %f, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> [[V10]])
+; CHECK-NEXT: [[V20:%.*]] = insertelement <4 x float> undef, float %f, i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[V20]])
+; CHECK-NEXT: [[V30:%.*]] = insertelement <4 x float> undef, float %f, i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> [[V30]])
+; CHECK-NEXT: [[V40:%.*]] = insertelement <2 x double> undef, double %d, i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> [[V40]])
+; CHECK-NEXT: [[V50:%.*]] = insertelement <2 x double> undef, double %d, i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> [[V50]])
+; CHECK-NEXT: [[V60:%.*]] = insertelement <2 x double> undef, double %d, i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> [[V60]])
+; CHECK-NEXT: [[V70:%.*]] = insertelement <2 x double> undef, double %d, i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> [[V70]])
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]]
+; CHECK-NEXT: ret i64 [[TMP15]]
+;
+ %v00 = insertelement <4 x float> undef, float %f, i32 0
+ %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
+ %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
+ %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
+ %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03)
+ %v10 = insertelement <4 x float> undef, float %f, i32 0
+ %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
+ %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
+ %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
+ %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13)
+ %v20 = insertelement <4 x float> undef, float %f, i32 0
+ %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
+ %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
+ %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
+ %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23)
+ %v30 = insertelement <4 x float> undef, float %f, i32 0
+ %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
+ %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
+ %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
+ %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33)
+ %v40 = insertelement <2 x double> undef, double %d, i32 0
+ %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
+ %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41)
+ %v50 = insertelement <2 x double> undef, double %d, i32 0
+ %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
+ %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51)
+ %v60 = insertelement <2 x double> undef, double %d, i32 0
+ %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
+ %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61)
+ %v70 = insertelement <2 x double> undef, double %d, i32 0
+ %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
+ %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71)
+ %tmp8 = add i32 %tmp0, %tmp2
+ %tmp9 = add i32 %tmp4, %tmp6
+ %tmp10 = add i32 %tmp8, %tmp9
+ %tmp11 = sext i32 %tmp10 to i64
+ %tmp12 = add i64 %tmp1, %tmp3
+ %tmp13 = add i64 %tmp5, %tmp7
+ %tmp14 = add i64 %tmp12, %tmp13
+ %tmp15 = add i64 %tmp11, %tmp14
+ ret i64 %tmp15
+}
+
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>)
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>)
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)
diff --git a/test/Transforms/InstCombine/x86-vector-shifts.ll b/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
index 07934fbdfe72..07934fbdfe72 100644
--- a/test/Transforms/InstCombine/x86-vector-shifts.ll
+++ b/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
diff --git a/test/Transforms/InstCombine/x86-vperm2.ll b/test/Transforms/InstCombine/X86/x86-vperm2.ll
index 84f69aa25d24..84f69aa25d24 100644
--- a/test/Transforms/InstCombine/x86-vperm2.ll
+++ b/test/Transforms/InstCombine/X86/x86-vperm2.ll
diff --git a/test/Transforms/InstCombine/x86-vpermil.ll b/test/Transforms/InstCombine/X86/x86-vpermil.ll
index f68eb36c4b58..f68eb36c4b58 100644
--- a/test/Transforms/InstCombine/x86-vpermil.ll
+++ b/test/Transforms/InstCombine/X86/x86-vpermil.ll
diff --git a/test/Transforms/InstCombine/x86-xop.ll b/test/Transforms/InstCombine/X86/x86-xop.ll
index 03a3f921abb2..03a3f921abb2 100644
--- a/test/Transforms/InstCombine/x86-xop.ll
+++ b/test/Transforms/InstCombine/X86/x86-xop.ll
diff --git a/test/Transforms/InstCombine/add.ll b/test/Transforms/InstCombine/add.ll
index 648305d134cd..5f7101e8feca 100644
--- a/test/Transforms/InstCombine/add.ll
+++ b/test/Transforms/InstCombine/add.ll
@@ -27,6 +27,32 @@ define <2 x i32> @select_0_or_1_from_bool_vec(<2 x i1> %x) {
ret <2 x i32> %add
}
+; This is an 'andn' of the low bit.
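+; For example, with x = 5 (0b101): shl 31 keeps only bit 0, ashr 31 smears
+; it to -1, and add 1 yields 0, the same as (x & 1) ^ 1.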
+
+define i32 @flip_and_mask(i32 %x) {
+; CHECK-LABEL: @flip_and_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 %x, 1
+; CHECK-NEXT: [[INC:%.*]] = xor i32 [[TMP1]], 1
+; CHECK-NEXT: ret i32 [[INC]]
+;
+ %shl = shl i32 %x, 31
+ %shr = ashr i32 %shl, 31
+ %inc = add i32 %shr, 1
+ ret i32 %inc
+}
+
+define <2 x i8> @flip_and_mask_splat(<2 x i8> %x) {
+; CHECK-LABEL: @flip_and_mask_splat(
+; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> %x, <i8 1, i8 1>
+; CHECK-NEXT: [[INC:%.*]] = and <2 x i8> [[TMP1]], <i8 1, i8 1>
+; CHECK-NEXT: ret <2 x i8> [[INC]]
+;
+ %shl = shl <2 x i8> %x, <i8 7, i8 7>
+ %shr = ashr <2 x i8> %shl, <i8 7, i8 7>
+ %inc = add <2 x i8> %shr, <i8 1, i8 1>
+ ret <2 x i8> %inc
+}
+
define i32 @test1(i32 %A) {
; CHECK-LABEL: @test1(
; CHECK-NEXT: ret i32 %A
diff --git a/test/Transforms/InstCombine/and.ll b/test/Transforms/InstCombine/and.ll
index 8ef7870891f0..7bb9b95b3179 100644
--- a/test/Transforms/InstCombine/and.ll
+++ b/test/Transforms/InstCombine/and.ll
@@ -310,7 +310,7 @@ define i8 @test27(i8 %A) {
ret i8 %E
}
-;; This is juse a zero extending shr.
+;; This is just a zero-extending shr.
define i32 @test28(i32 %X) {
; CHECK-LABEL: @test28(
; CHECK-NEXT: [[Y1:%.*]] = lshr i32 %X, 24
diff --git a/test/Transforms/InstCombine/bit-tracking.ll b/test/Transforms/InstCombine/bit-tracking.ll
deleted file mode 100644
index 51bbc0888836..000000000000
--- a/test/Transforms/InstCombine/bit-tracking.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; This file contains various testcases that require tracking whether bits are
-; set or cleared by various instructions.
-; RUN: opt < %s -instcombine -instcombine -S |\
-; RUN: not grep %ELIM
-
-; Reduce down to a single XOR
-define i32 @test3(i32 %B) {
- %ELIMinc = and i32 %B, 1 ; <i32> [#uses=1]
- %tmp.5 = xor i32 %ELIMinc, 1 ; <i32> [#uses=1]
- %ELIM7 = and i32 %B, -2 ; <i32> [#uses=1]
- %tmp.8 = or i32 %tmp.5, %ELIM7 ; <i32> [#uses=1]
- ret i32 %tmp.8
-}
-
-; Finally, a bigger case where we chain things together. This corresponds to
-; incrementing a single-bit bitfield, which should become just an xor.
-define i32 @test4(i32 %B) {
- %ELIM3 = shl i32 %B, 31 ; <i32> [#uses=1]
- %ELIM4 = ashr i32 %ELIM3, 31 ; <i32> [#uses=1]
- %inc = add i32 %ELIM4, 1 ; <i32> [#uses=1]
- %ELIM5 = and i32 %inc, 1 ; <i32> [#uses=1]
- %ELIM7 = and i32 %B, -2 ; <i32> [#uses=1]
- %tmp.8 = or i32 %ELIM5, %ELIM7 ; <i32> [#uses=1]
- ret i32 %tmp.8
-}
-
diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index 4621d33d4388..a4375a5cd57e 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll
@@ -1432,3 +1432,41 @@ define <2 x i32> @test90() {
%tmp6 = bitcast <4 x half> <half undef, half undef, half undef, half 0xH3C00> to <2 x i32>
ret <2 x i32> %tmp6
}
+
+; Do not optimize to ashr i64 (shift by 48 > 96 - 64)
+define i64 @test91(i64 %A) {
+; CHECK-LABEL: @test91(
+; CHECK-NEXT: [[B:%.*]] = sext i64 %A to i96
+; CHECK-NEXT: [[C:%.*]] = lshr i96 [[B]], 48
+; CHECK-NEXT: [[D:%.*]] = trunc i96 [[C]] to i64
+; CHECK-NEXT: ret i64 [[D]]
+;
+ %B = sext i64 %A to i96
+ %C = lshr i96 %B, 48
+ %D = trunc i96 %C to i64
+ ret i64 %D
+}
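+; (Informal: lshr i96 by 48 zero-fills the top 48 result bits, so after the
+; trunc the high 16 bits are zero, whereas ashr i64 %A, 48 would sign-fill
+; them.)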
+
+; Do optimize to ashr i64 (shift by 32 <= 96 - 64)
+define i64 @test92(i64 %A) {
+; CHECK-LABEL: @test92(
+; CHECK-NEXT: [[C:%.*]] = ashr i64 %A, 32
+; CHECK-NEXT: ret i64 [[C]]
+;
+ %B = sext i64 %A to i96
+ %C = lshr i96 %B, 32
+ %D = trunc i96 %C to i64
+ ret i64 %D
+}
+
+; When optimizing to ashr i32, don't shift by more than 31.
+define i32 @test93(i32 %A) {
+; CHECK-LABEL: @test93(
+; CHECK-NEXT: [[C:%.*]] = ashr i32 %A, 31
+; CHECK-NEXT: ret i32 [[C]]
+;
+ %B = sext i32 %A to i96
+ %C = lshr i96 %B, 64
+ %D = trunc i96 %C to i32
+ ret i32 %D
+}
diff --git a/test/Transforms/InstCombine/constant-fold-iteration.ll b/test/Transforms/InstCombine/constant-fold-iteration.ll
new file mode 100644
index 000000000000..e1b692173ce8
--- /dev/null
+++ b/test/Transforms/InstCombine/constant-fold-iteration.ll
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S -debug 2>&1 | FileCheck %s
+; REQUIRES: asserts
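+; InstCombine should reach a fixed point in a single iteration here; a
+; second ITERATION line in the -debug output would mean the fold keeps
+; re-triggering itself (see the checks at the bottom).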
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
+
+define i32 @a() nounwind readnone {
+entry:
+ ret i32 zext (i1 icmp eq (i32 0, i32 ptrtoint (i32 ()* @a to i32)) to i32)
+}
+; CHECK: INSTCOMBINE ITERATION #1
+; CHECK-NOT: INSTCOMBINE ITERATION #2
diff --git a/test/Transforms/InstCombine/demorgan.ll b/test/Transforms/InstCombine/demorgan.ll
index 26c2270a3fdf..8c3d3b830468 100644
--- a/test/Transforms/InstCombine/demorgan.ll
+++ b/test/Transforms/InstCombine/demorgan.ll
@@ -399,7 +399,7 @@ define i32 @demorgan_or_zext(i1 %X, i1 %Y) {
; CHECK-LABEL: @demorgan_or_zext(
; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and i1 %X, %Y
; CHECK-NEXT: [[OR1:%.*]] = xor i1 [[OR1_DEMORGAN]], true
-; CHECK-NEXT: [[OR:%.*]] = zext i1 [[OR:%.*]]1 to i32
+; CHECK-NEXT: [[OR:%.*]] = zext i1 [[OR1]] to i32
; CHECK-NEXT: ret i32 [[OR]]
;
%zextX = zext i1 %X to i32
@@ -414,7 +414,7 @@ define i32 @demorgan_and_zext(i1 %X, i1 %Y) {
; CHECK-LABEL: @demorgan_and_zext(
; CHECK-NEXT: [[AND1_DEMORGAN:%.*]] = or i1 %X, %Y
; CHECK-NEXT: [[AND1:%.*]] = xor i1 [[AND1_DEMORGAN]], true
-; CHECK-NEXT: [[AND:%.*]] = zext i1 [[AND:%.*]]1 to i32
+; CHECK-NEXT: [[AND:%.*]] = zext i1 [[AND1]] to i32
; CHECK-NEXT: ret i32 [[AND]]
;
%zextX = zext i1 %X to i32
@@ -429,7 +429,7 @@ define <2 x i32> @demorgan_or_zext_vec(<2 x i1> %X, <2 x i1> %Y) {
; CHECK-LABEL: @demorgan_or_zext_vec(
; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and <2 x i1> %X, %Y
; CHECK-NEXT: [[OR1:%.*]] = xor <2 x i1> [[OR1_DEMORGAN]], <i1 true, i1 true>
-; CHECK-NEXT: [[OR:%.*]] = zext <2 x i1> [[OR:%.*]]1 to <2 x i32>
+; CHECK-NEXT: [[OR:%.*]] = zext <2 x i1> [[OR1]] to <2 x i32>
; CHECK-NEXT: ret <2 x i32> [[OR]]
;
%zextX = zext <2 x i1> %X to <2 x i32>
@@ -444,7 +444,7 @@ define <2 x i32> @demorgan_and_zext_vec(<2 x i1> %X, <2 x i1> %Y) {
; CHECK-LABEL: @demorgan_and_zext_vec(
; CHECK-NEXT: [[AND1_DEMORGAN:%.*]] = or <2 x i1> %X, %Y
; CHECK-NEXT: [[AND1:%.*]] = xor <2 x i1> [[AND1_DEMORGAN]], <i1 true, i1 true>
-; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[AND:%.*]]1 to <2 x i32>
+; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[AND1]] to <2 x i32>
; CHECK-NEXT: ret <2 x i32> [[AND]]
;
%zextX = zext <2 x i1> %X to <2 x i32>
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index edfa9a102917..6f657b190454 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -695,6 +695,21 @@ define i1 @test48(i32 %X, i32 %Y, i32 %Z) {
ret i1 %C
}
+; The above transform only works for equality predicates.
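+; e.g. X = 10, Y = 20, Z = -10 gives A = -1 sgt B = -2 even though X slt Y.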
+
+define i1 @PR32949(i32 %X, i32 %Y, i32 %Z) {
+; CHECK-LABEL: @PR32949(
+; CHECK-NEXT: [[A:%.*]] = sdiv exact i32 %X, %Z
+; CHECK-NEXT: [[B:%.*]] = sdiv exact i32 %Y, %Z
+; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 [[A]], [[B]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %A = sdiv exact i32 %X, %Z
+ %B = sdiv exact i32 %Y, %Z
+ %C = icmp sgt i32 %A, %B
+ ret i1 %C
+}
+
; PR8469
define <2 x i1> @test49(<2 x i32> %tmp3) {
; CHECK-LABEL: @test49(
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index 66ab7f48aeff..5654b265da58 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -instcombine -S < %s | FileCheck %s
%overflow.result = type {i8, i1}
@@ -283,14 +284,24 @@ define i32 @cttz(i32 %a) {
define i1 @cttz_knownbits(i32 %arg) {
; CHECK-LABEL: @cttz_knownbits(
+; CHECK-NEXT: ret i1 false
+;
+ %or = or i32 %arg, 4
+ %cnt = call i32 @llvm.cttz.i32(i32 %or, i1 true) nounwind readnone
+ %res = icmp eq i32 %cnt, 4
+ ret i1 %res
+}
+
+define i1 @cttz_knownbits2(i32 %arg) {
+; CHECK-LABEL: @cttz_knownbits2(
; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG:%.*]], 4
; CHECK-NEXT: [[CNT:%.*]] = call i32 @llvm.cttz.i32(i32 [[OR]], i1 true)
-; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CNT]], 4
+; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CNT]], 2
; CHECK-NEXT: ret i1 [[RES]]
;
%or = or i32 %arg, 4
%cnt = call i32 @llvm.cttz.i32(i32 %or, i1 true) nounwind readnone
- %res = icmp eq i32 %cnt, 4
+ %res = icmp eq i32 %cnt, 2
ret i1 %res
}
@@ -306,14 +317,24 @@ define i8 @ctlz(i8 %a) {
define i1 @ctlz_knownbits(i8 %arg) {
; CHECK-LABEL: @ctlz_knownbits(
+; CHECK-NEXT: ret i1 false
+;
+ %or = or i8 %arg, 32
+ %cnt = call i8 @llvm.ctlz.i8(i8 %or, i1 true) nounwind readnone
+ %res = icmp eq i8 %cnt, 4
+ ret i1 %res
+}
+
+define i1 @ctlz_knownbits2(i8 %arg) {
+; CHECK-LABEL: @ctlz_knownbits2(
; CHECK-NEXT: [[OR:%.*]] = or i8 [[ARG:%.*]], 32
; CHECK-NEXT: [[CNT:%.*]] = call i8 @llvm.ctlz.i8(i8 [[OR]], i1 true)
-; CHECK-NEXT: [[RES:%.*]] = icmp eq i8 [[CNT]], 4
+; CHECK-NEXT: [[RES:%.*]] = icmp eq i8 [[CNT]], 2
; CHECK-NEXT: ret i1 [[RES]]
;
%or = or i8 %arg, 32
%cnt = call i8 @llvm.ctlz.i8(i8 %or, i1 true) nounwind readnone
- %res = icmp eq i8 %cnt, 4
+ %res = icmp eq i8 %cnt, 2
ret i1 %res
}
diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll
index 3ab40c4de92d..7f0bd23eb8a5 100644
--- a/test/Transforms/InstCombine/logical-select.ll
+++ b/test/Transforms/InstCombine/logical-select.ll
@@ -62,6 +62,81 @@ define i32 @poo(i32 %a, i32 %b, i32 %c, i32 %d) {
ret i32 %t3
}
+; TODO: For the next 4 tests, are there potential canonicalizations and/or folds for these
+; in InstCombine? Independent of that, tests like these, which may not show any
+; transforms, still have value: they can help identify conflicting canonicalization
+; rules that lead to infinite looping.
+
+; PR32791 - https://bugs.llvm.org//show_bug.cgi?id=32791
+; Fold two selects with inverted predicates and zero operands.
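+; A hypothetical shape for such a fold (not asserted by the checks below):
+;   (a < b ? c : 0) | (a >= b ? d : 0) --> a < b ? c : d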
+define i32 @fold_inverted_icmp_preds(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @fold_inverted_icmp_preds(
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 %a, %b
+; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP1]], i32 %c, i32 0
+; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 %a, %b
+; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[CMP2]], i32 %d, i32 0
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL1]], [[SEL2]]
+; CHECK-NEXT: ret i32 [[OR]]
+;
+ %cmp1 = icmp slt i32 %a, %b
+ %sel1 = select i1 %cmp1, i32 %c, i32 0
+ %cmp2 = icmp sge i32 %a, %b
+ %sel2 = select i1 %cmp2, i32 %d, i32 0
+ %or = or i32 %sel1, %sel2
+ ret i32 %or
+}
+
+define i32 @fold_inverted_icmp_preds_reverse(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @fold_inverted_icmp_preds_reverse(
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 %a, %b
+; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP1]], i32 0, i32 %c
+; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 %a, %b
+; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[CMP2]], i32 0, i32 %d
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL1]], [[SEL2]]
+; CHECK-NEXT: ret i32 [[OR]]
+;
+ %cmp1 = icmp slt i32 %a, %b
+ %sel1 = select i1 %cmp1, i32 0, i32 %c
+ %cmp2 = icmp sge i32 %a, %b
+ %sel2 = select i1 %cmp2, i32 0, i32 %d
+ %or = or i32 %sel1, %sel2
+ ret i32 %or
+}
+
+define i32 @fold_inverted_fcmp_preds(float %a, float %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @fold_inverted_fcmp_preds(
+; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float %a, %b
+; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP1]], i32 %c, i32 0
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp uge float %a, %b
+; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[CMP2]], i32 %d, i32 0
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL1]], [[SEL2]]
+; CHECK-NEXT: ret i32 [[OR]]
+;
+ %cmp1 = fcmp olt float %a, %b
+ %sel1 = select i1 %cmp1, i32 %c, i32 0
+ %cmp2 = fcmp uge float %a, %b
+ %sel2 = select i1 %cmp2, i32 %d, i32 0
+ %or = or i32 %sel1, %sel2
+ ret i32 %or
+}
+
+define <2 x i32> @fold_inverted_icmp_vector_preds(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
+; CHECK-LABEL: @fold_inverted_icmp_vector_preds(
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ne <2 x i32> %a, %b
+; CHECK-NEXT: [[SEL1:%.*]] = select <2 x i1> [[CMP1]], <2 x i32> %c, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq <2 x i32> %a, %b
+; CHECK-NEXT: [[SEL2:%.*]] = select <2 x i1> [[CMP2]], <2 x i32> %d, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[SEL1]], [[SEL2]]
+; CHECK-NEXT: ret <2 x i32> [[OR]]
+;
+ %cmp1 = icmp ne <2 x i32> %a, %b
+ %sel1 = select <2 x i1> %cmp1, <2 x i32> %c, <2 x i32> <i32 0, i32 0>
+ %cmp2 = icmp eq <2 x i32> %a, %b
+ %sel2 = select <2 x i1> %cmp2, <2 x i32> %d, <2 x i32> <i32 0, i32 0>
+ %or = or <2 x i32> %sel1, %sel2
+ ret <2 x i32> %or
+}
+
define i32 @par(i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK-LABEL: @par(
; CHECK-NEXT: [[T0:%.*]] = icmp slt i32 %a, %b
diff --git a/test/Transforms/InstCombine/not.ll b/test/Transforms/InstCombine/not.ll
index 2760d4ae044d..6ff0a50318d2 100644
--- a/test/Transforms/InstCombine/not.ll
+++ b/test/Transforms/InstCombine/not.ll
@@ -11,8 +11,8 @@ define i32 @test1(i32 %A) {
define i1 @invert_icmp(i32 %A, i32 %B) {
; CHECK-LABEL: @invert_icmp(
-; CHECK-NEXT: [[NOT:%.*]] = icmp sgt i32 %A, %B
-; CHECK-NEXT: ret i1 [[NOT]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %A, %B
+; CHECK-NEXT: ret i1 [[CMP]]
;
%cmp = icmp sle i32 %A, %B
%not = xor i1 %cmp, true
@@ -23,8 +23,8 @@ define i1 @invert_icmp(i32 %A, i32 %B) {
define i1 @invert_fcmp(float %X, float %Y) {
; CHECK-LABEL: @invert_fcmp(
-; CHECK-NEXT: [[NOT:%.*]] = fcmp uge float %X, %Y
-; CHECK-NEXT: ret i1 [[NOT]]
+; CHECK-NEXT: [[CMP:%.*]] = fcmp uge float %X, %Y
+; CHECK-NEXT: ret i1 [[CMP]]
;
%cmp = fcmp olt float %X, %Y
%not = xor i1 %cmp, true
@@ -48,11 +48,75 @@ define zeroext i8 @test6(i32 %a, i32 %b) {
define <2 x i1> @test7(<2 x i32> %A, <2 x i32> %B) {
; CHECK-LABEL: @test7(
-; CHECK-NEXT: [[RET:%.*]] = icmp sgt <2 x i32> %A, %B
-; CHECK-NEXT: ret <2 x i1> [[RET]]
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt <2 x i32> %A, %B
+; CHECK-NEXT: ret <2 x i1> [[COND]]
;
%cond = icmp sle <2 x i32> %A, %B
%Ret = xor <2 x i1> %cond, <i1 true, i1 true>
ret <2 x i1> %Ret
}
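+
+; ~(~A >> B) --> A >> B for arithmetic shifts: ashr replicates the sign
+; bit, so bitwise-not commutes with it (informal rationale, not a check).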
+define i32 @not_ashr_not(i32 %A, i32 %B) {
+; CHECK-LABEL: @not_ashr_not(
+; CHECK-NEXT: [[NOT2:%.*]] = ashr i32 %A, %B
+; CHECK-NEXT: ret i32 [[NOT2]]
+;
+ %not1 = xor i32 %A, -1
+ %ashr = ashr i32 %not1, %B
+ %not2 = xor i32 %ashr, -1
+ ret i32 %not2
+}
+
+define i8 @not_ashr_const(i8 %x) {
+; CHECK-LABEL: @not_ashr_const(
+; CHECK-NEXT: [[NOT:%.*]] = lshr i8 41, %x
+; CHECK-NEXT: ret i8 [[NOT]]
+;
+ %shr = ashr i8 -42, %x
+ %not = xor i8 %shr, -1
+ ret i8 %not
+}
+
+define <2 x i8> @not_ashr_const_splat(<2 x i8> %x) {
+; CHECK-LABEL: @not_ashr_const_splat(
+; CHECK-NEXT: [[NOT:%.*]] = lshr <2 x i8> <i8 41, i8 41>, %x
+; CHECK-NEXT: ret <2 x i8> [[NOT]]
+;
+ %shr = ashr <2 x i8> <i8 -42, i8 -42>, %x
+ %not = xor <2 x i8> %shr, <i8 -1, i8 -1>
+ ret <2 x i8> %not
+}
+
+; We can't get rid of the 'not' on a logical shift of a negative constant.
+
+define i8 @not_lshr_const_negative(i8 %x) {
+; CHECK-LABEL: @not_lshr_const_negative(
+; CHECK-NEXT: [[SHR:%.*]] = lshr i8 -42, %x
+; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[SHR]], -1
+; CHECK-NEXT: ret i8 [[NOT]]
+;
+ %shr = lshr i8 -42, %x
+ %not = xor i8 %shr, -1
+ ret i8 %not
+}
+
+define i8 @not_lshr_const(i8 %x) {
+; CHECK-LABEL: @not_lshr_const(
+; CHECK-NEXT: [[NOT:%.*]] = ashr i8 -43, %x
+; CHECK-NEXT: ret i8 [[NOT]]
+;
+ %shr = lshr i8 42, %x
+ %not = xor i8 %shr, -1
+ ret i8 %not
+}
+
+define <2 x i8> @not_lshr_const_splat(<2 x i8> %x) {
+; CHECK-LABEL: @not_lshr_const_splat(
+; CHECK-NEXT: [[NOT:%.*]] = ashr <2 x i8> <i8 -43, i8 -43>, %x
+; CHECK-NEXT: ret <2 x i8> [[NOT]]
+;
+ %shr = lshr <2 x i8> <i8 42, i8 42>, %x
+ %not = xor <2 x i8> %shr, <i8 -1, i8 -1>
+ ret <2 x i8> %not
+}
+
diff --git a/test/Transforms/InstCombine/or-xor.ll b/test/Transforms/InstCombine/or-xor.ll
index ec5b71656a47..f2bc290d79a4 100644
--- a/test/Transforms/InstCombine/or-xor.ll
+++ b/test/Transforms/InstCombine/or-xor.ll
@@ -230,3 +230,73 @@ define i32 @test16(i32 %a, i32 %b) {
%xor = or i32 %and1, %and2
ret i32 %xor
}
+
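+; ~x | C --> (x | C) ^ ~C: the bits covered by C stay set either way, and
+; the remaining bits are just ~x (here C = 7, so the xor constant is -8).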
+define i8 @not_or(i8 %x) {
+; CHECK-LABEL: @not_or(
+; CHECK-NEXT: [[NOTX:%.*]] = or i8 %x, 7
+; CHECK-NEXT: [[OR:%.*]] = xor i8 [[NOTX]], -8
+; CHECK-NEXT: ret i8 [[OR]]
+;
+ %notx = xor i8 %x, -1
+ %or = or i8 %notx, 7
+ ret i8 %or
+}
+
+define i8 @not_or_xor(i8 %x) {
+; CHECK-LABEL: @not_or_xor(
+; CHECK-NEXT: [[NOTX:%.*]] = or i8 %x, 7
+; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[NOTX]], -12
+; CHECK-NEXT: ret i8 [[XOR]]
+;
+ %notx = xor i8 %x, -1
+ %or = or i8 %notx, 7
+ %xor = xor i8 %or, 12
+ ret i8 %xor
+}
+
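+; When the xor and or masks are disjoint (32 & 7 == 0) the two operations
+; commute, so the xor floats after the or; overlapping xor bits are simply
+; absorbed (33 & ~7 == 32 in @xor_or2 below).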
+define i8 @xor_or(i8 %x) {
+; CHECK-LABEL: @xor_or(
+; CHECK-NEXT: [[XOR:%.*]] = or i8 %x, 7
+; CHECK-NEXT: [[OR:%.*]] = xor i8 [[XOR]], 32
+; CHECK-NEXT: ret i8 [[OR]]
+;
+ %xor = xor i8 %x, 32
+ %or = or i8 %xor, 7
+ ret i8 %or
+}
+
+define i8 @xor_or2(i8 %x) {
+; CHECK-LABEL: @xor_or2(
+; CHECK-NEXT: [[XOR:%.*]] = or i8 %x, 7
+; CHECK-NEXT: [[OR:%.*]] = xor i8 [[XOR]], 32
+; CHECK-NEXT: ret i8 [[OR]]
+;
+ %xor = xor i8 %x, 33
+ %or = or i8 %xor, 7
+ ret i8 %or
+}
+
+define i8 @xor_or_xor(i8 %x) {
+; CHECK-LABEL: @xor_or_xor(
+; CHECK-NEXT: [[XOR1:%.*]] = or i8 %x, 7
+; CHECK-NEXT: [[XOR2:%.*]] = xor i8 [[XOR1]], 44
+; CHECK-NEXT: ret i8 [[XOR2]]
+;
+ %xor1 = xor i8 %x, 33
+ %or = or i8 %xor1, 7
+ %xor2 = xor i8 %or, 12
+ ret i8 %xor2
+}
+
+define i8 @or_xor_or(i8 %x) {
+; CHECK-LABEL: @or_xor_or(
+; CHECK-NEXT: [[XOR:%.*]] = or i8 %x, 39
+; CHECK-NEXT: [[OR2:%.*]] = xor i8 [[XOR]], 8
+; CHECK-NEXT: ret i8 [[OR2]]
+;
+ %or1 = or i8 %x, 33
+ %xor = xor i8 %or1, 12
+ %or2 = or i8 %xor, 7
+ ret i8 %or2
+}
+
diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll
index bfafd66ebb41..764fe4503b5e 100644
--- a/test/Transforms/InstCombine/or.ll
+++ b/test/Transforms/InstCombine/or.ll
@@ -3,115 +3,6 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-define i32 @test1(i32 %A) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: ret i32 %A
-;
- %B = or i32 %A, 0
- ret i32 %B
-}
-
-define i32 @test2(i32 %A) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: ret i32 -1
-;
- %B = or i32 %A, -1
- ret i32 %B
-}
-
-define i8 @test2a(i8 %A) {
-; CHECK-LABEL: @test2a(
-; CHECK-NEXT: ret i8 -1
-;
- %B = or i8 %A, -1
- ret i8 %B
-}
-
-define i1 @test3(i1 %A) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT: ret i1 %A
-;
- %B = or i1 %A, false
- ret i1 %B
-}
-
-define i1 @test4(i1 %A) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT: ret i1 true
-;
- %B = or i1 %A, true
- ret i1 %B
-}
-
-define i1 @test5(i1 %A) {
-; CHECK-LABEL: @test5(
-; CHECK-NEXT: ret i1 %A
-;
- %B = or i1 %A, %A
- ret i1 %B
-}
-
-define i32 @test6(i32 %A) {
-; CHECK-LABEL: @test6(
-; CHECK-NEXT: ret i32 %A
-;
- %B = or i32 %A, %A
- ret i32 %B
-}
-
-; A | ~A == -1
-define i32 @test7(i32 %A) {
-; CHECK-LABEL: @test7(
-; CHECK-NEXT: ret i32 -1
-;
- %NotA = xor i32 -1, %A
- %B = or i32 %A, %NotA
- ret i32 %B
-}
-
-define i8 @test8(i8 %A) {
-; CHECK-LABEL: @test8(
-; CHECK-NEXT: ret i8 -1
-;
- %B = or i8 %A, -2
- %C = or i8 %B, 1
- ret i8 %C
-}
-
-; Test that (A|c1)|(B|c2) == (A|B)|(c1|c2)
-define i8 @test9(i8 %A, i8 %B) {
-; CHECK-LABEL: @test9(
-; CHECK-NEXT: ret i8 -1
-;
- %C = or i8 %A, 1
- %D = or i8 %B, -2
- %E = or i8 %C, %D
- ret i8 %E
-}
-
-define i8 @test10(i8 %A) {
-; CHECK-LABEL: @test10(
-; CHECK-NEXT: ret i8 -2
-;
- %B = or i8 %A, 1
- %C = and i8 %B, -2
- ; (X & C1) | C2 --> (X | C2) & (C1|C2)
- %D = or i8 %C, -2
- ret i8 %D
-}
-
-define i8 @test11(i8 %A) {
-; CHECK-LABEL: @test11(
-; CHECK-NEXT: ret i8 -1
-;
- %B = or i8 %A, -2
- %C = xor i8 %B, 13
- ; (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
- %D = or i8 %C, 1
- %E = xor i8 %D, 12
- ret i8 %E
-}
-
define i32 @test12(i32 %A) {
; Should be eliminated
; CHECK-LABEL: @test12(
diff --git a/test/Transforms/InstCombine/sext.ll b/test/Transforms/InstCombine/sext.ll
index 4cdd080fb0e0..46406ac2f788 100644
--- a/test/Transforms/InstCombine/sext.ll
+++ b/test/Transforms/InstCombine/sext.ll
@@ -128,7 +128,7 @@ F:
define i32 @test10(i32 %i) {
; CHECK-LABEL: @test10(
; CHECK-NEXT: [[B1:%.*]] = shl i32 %i, 30
-; CHECK-NEXT: [[B:%.*]] = ashr exact i32 [[B:%.*]]1, 30
+; CHECK-NEXT: [[B:%.*]] = ashr exact i32 [[B1]], 30
; CHECK-NEXT: ret i32 [[B]]
;
%tmp12 = trunc i32 %i to i8
diff --git a/test/Transforms/InstCombine/trunc.ll b/test/Transforms/InstCombine/trunc.ll
index 5597b578f017..dd86e5a907b8 100644
--- a/test/Transforms/InstCombine/trunc.ll
+++ b/test/Transforms/InstCombine/trunc.ll
@@ -24,7 +24,7 @@ define i64 @test2(i64 %a) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: [[B:%.*]] = trunc i64 %a to i32
; CHECK-NEXT: [[D1:%.*]] = shl i64 %a, 36
-; CHECK-NEXT: [[D:%.*]] = ashr exact i64 [[D:%.*]]1, 36
+; CHECK-NEXT: [[D:%.*]] = ashr exact i64 [[D1]], 36
; CHECK-NEXT: call void @use(i32 [[B]])
; CHECK-NEXT: ret i64 [[D]]
;
diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index 5f27634da19c..00efbe00b08d 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -2,30 +2,6 @@
; RUN: opt < %s -instcombine -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-define i16 @test1(float %f) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: [[TMP281:%.*]] = fadd float %f, -1.000000e+00
-; CHECK-NEXT: [[TMP373:%.*]] = fmul float [[TMP281]], 5.000000e-01
-; CHECK-NEXT: [[TMP374:%.*]] = insertelement <4 x float> undef, float [[TMP373]], i32 0
-; CHECK-NEXT: [[TMP48:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP374]], <4 x float> <float 6.553500e+04, float undef, float undef, float undef>)
-; CHECK-NEXT: [[TMP59:%.*]] = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[TMP48]], <4 x float> <float 0.000000e+00, float undef, float undef, float undef>)
-; CHECK-NEXT: [[TMP_UPGRD_1:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[TMP59]])
-; CHECK-NEXT: [[TMP69:%.*]] = trunc i32 [[TMP_UPGRD_1]] to i16
-; CHECK-NEXT: ret i16 [[TMP69]]
-;
- %tmp = insertelement <4 x float> undef, float %f, i32 0
- %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1
- %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
- %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
- %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )
- %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )
- %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )
- %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer )
- %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )
- %tmp69 = trunc i32 %tmp.upgrd.1 to i16
- ret i16 %tmp69
-}
-
define i32 @test2(float %f) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: [[TMP5:%.*]] = fmul float %f, %f
@@ -42,77 +18,6 @@ define i32 @test2(float %f) {
ret i32 %tmp21
}
-define i64 @test3(float %f, double %d) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT: [[V00:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> [[V00]])
-; CHECK-NEXT: [[V10:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> [[V10]])
-; CHECK-NEXT: [[V20:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[V20]])
-; CHECK-NEXT: [[V30:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> [[V30]])
-; CHECK-NEXT: [[V40:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> [[V40]])
-; CHECK-NEXT: [[V50:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> [[V50]])
-; CHECK-NEXT: [[V60:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> [[V60]])
-; CHECK-NEXT: [[V70:%.*]] = insertelement <2 x double> undef, double %d, i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> [[V70]])
-; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP4]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]]
-; CHECK-NEXT: ret i64 [[TMP15]]
-;
- %v00 = insertelement <4 x float> undef, float %f, i32 0
- %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
- %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
- %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
- %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03)
- %v10 = insertelement <4 x float> undef, float %f, i32 0
- %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
- %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
- %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
- %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13)
- %v20 = insertelement <4 x float> undef, float %f, i32 0
- %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
- %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
- %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
- %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23)
- %v30 = insertelement <4 x float> undef, float %f, i32 0
- %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
- %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
- %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
- %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33)
- %v40 = insertelement <2 x double> undef, double %d, i32 0
- %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
- %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41)
- %v50 = insertelement <2 x double> undef, double %d, i32 0
- %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
- %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51)
- %v60 = insertelement <2 x double> undef, double %d, i32 0
- %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
- %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61)
- %v70 = insertelement <2 x double> undef, double %d, i32 0
- %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
- %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71)
- %tmp8 = add i32 %tmp0, %tmp2
- %tmp9 = add i32 %tmp4, %tmp6
- %tmp10 = add i32 %tmp8, %tmp9
- %tmp11 = sext i32 %tmp10 to i64
- %tmp12 = add i64 %tmp1, %tmp3
- %tmp13 = add i64 %tmp5, %tmp7
- %tmp14 = add i64 %tmp12, %tmp13
- %tmp15 = add i64 %tmp11, %tmp14
- ret i64 %tmp15
-}
-
define void @get_image() nounwind {
; CHECK-LABEL: @get_image(
; CHECK-NEXT: entry:
@@ -156,18 +61,6 @@ entry:
}
declare i32 @fgetc(i8*)
-declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
-declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)
-declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
-declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
-declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)
-declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>)
-declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
-declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>)
-declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
-declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
-declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
-declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)
define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
; CHECK-LABEL: @dead_shuffle_elt(
@@ -248,4 +141,3 @@ define <2 x i64> @PR24922(<2 x i64> %v) {
%result = select <2 x i1> <i1 icmp eq (i64 extractelement (<2 x i64> bitcast (<4 x i32> <i32 15, i32 15, i32 15, i32 15> to <2 x i64>), i64 0), i64 0), i1 true>, <2 x i64> %v, <2 x i64> zeroinitializer
ret <2 x i64> %result
}
-
diff --git a/test/Transforms/InstCombine/xor2.ll b/test/Transforms/InstCombine/xor2.ll
index f817ac5ca40c..3afbf632f6e1 100644
--- a/test/Transforms/InstCombine/xor2.ll
+++ b/test/Transforms/InstCombine/xor2.ll
@@ -57,17 +57,6 @@ define i32 @test3(i32 %tmp1) {
ret i32 %ov110
}
-define i32 @test4(i32 %A, i32 %B) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 %A, %B
-; CHECK-NEXT: ret i32 [[TMP1]]
-;
- %1 = xor i32 %A, -1
- %2 = ashr i32 %1, %B
- %3 = xor i32 %2, -1
- ret i32 %3
-}
-
; defect-2 in rdar://12329730
; (X^C1) >> C2) ^ C3 -> (X>>C2) ^ ((C1>>C2)^C3)
; where the "X" has more than one use
diff --git a/test/Transforms/InstNamer/basic.ll b/test/Transforms/InstNamer/basic.ll
new file mode 100644
index 000000000000..4c819246b90b
--- /dev/null
+++ b/test/Transforms/InstNamer/basic.ll
@@ -0,0 +1,19 @@
+; RUN: opt -S -instnamer < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @f_0(i32) {
+; CHECK-LABEL: @f_0(
+; CHECK: bb:
+; CHECK-NEXT: %tmp = add i32 %arg, 2
+; CHECK-NEXT: br label %bb1
+; CHECK: bb1:
+; CHECK-NEXT: ret i32 %tmp
+
+ %2 = add i32 %0, 2
+ br label %3
+
+; <label>:3:
+ ret i32 %2
+}
diff --git a/test/Transforms/InstSimplify/AndOrXor.ll b/test/Transforms/InstSimplify/AndOrXor.ll
index e059d77f1fa8..427ea655fcb2 100644
--- a/test/Transforms/InstSimplify/AndOrXor.ll
+++ b/test/Transforms/InstSimplify/AndOrXor.ll
@@ -628,3 +628,176 @@ define i32 @test46_commuted_and(i32 %a, i32 %b) {
%or = or i32 %xor, %and
ret i32 %or
}
+
+; (~A ^ B) | (A & B) -> ~A ^ B
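+; Per-bit check: if A = 0 then A & B = 0 and the or is a no-op; if A = 1
+; then ~A ^ B = B, which already covers A & B = B.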
+
+define i32 @test47(i32 %a, i32 %b) {
+; CHECK-LABEL: @test47(
+; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]]
+; CHECK-NEXT: ret i32 [[XOR]]
+;
+ %nega = xor i32 %a, -1
+ %and = and i32 %a, %b
+ %xor = xor i32 %nega, %b
+ %or = or i32 %xor, %and
+ ret i32 %or
+}
+
+define i32 @test48(i32 %a, i32 %b) {
+; CHECK-LABEL: @test48(
+; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]]
+; CHECK-NEXT: ret i32 [[XOR]]
+;
+ %nega = xor i32 %a, -1
+ %and = and i32 %a, %b
+ %xor = xor i32 %b, %nega
+ %or = or i32 %xor, %and
+ ret i32 %or
+}
+
+define i32 @test49(i32 %a, i32 %b) {
+; CHECK-LABEL: @test49(
+; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]]
+; CHECK-NEXT: ret i32 [[XOR]]
+;
+ %nega = xor i32 %a, -1
+ %and = and i32 %b, %a
+ %xor = xor i32 %b, %nega
+ %or = or i32 %xor, %and
+ ret i32 %or
+}
+
+define i32 @test50(i32 %a, i32 %b) {
+; CHECK-LABEL: @test50(
+; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]]
+; CHECK-NEXT: ret i32 [[XOR]]
+;
+ %nega = xor i32 %a, -1
+ %and = and i32 %b, %a
+ %xor = xor i32 %nega, %b
+ %or = or i32 %xor, %and
+ ret i32 %or
+}
+
+define i32 @test51(i32 %a, i32 %b) {
+; CHECK-LABEL: @test51(
+; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]]
+; CHECK-NEXT: ret i32 [[XOR]]
+;
+ %nega = xor i32 %a, -1
+ %and = and i32 %a, %b
+ %xor = xor i32 %nega, %b
+ %or = or i32 %and, %xor
+ ret i32 %or
+}
+
+define i32 @test52(i32 %a, i32 %b) {
+; CHECK-LABEL: @test52(
+; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]]
+; CHECK-NEXT: ret i32 [[XOR]]
+;
+ %nega = xor i32 %a, -1
+ %and = and i32 %a, %b
+ %xor = xor i32 %b, %nega
+ %or = or i32 %and, %xor
+ ret i32 %or
+}
+
+define i32 @test53(i32 %a, i32 %b) {
+; CHECK-LABEL: @test53(
+; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]]
+; CHECK-NEXT: ret i32 [[XOR]]
+;
+ %nega = xor i32 %a, -1
+ %and = and i32 %b, %a
+ %xor = xor i32 %b, %nega
+ %or = or i32 %and, %xor
+ ret i32 %or
+}
+
+define i32 @test54(i32 %a, i32 %b) {
+; CHECK-LABEL: @test54(
+; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]]
+; CHECK-NEXT: ret i32 [[XOR]]
+;
+ %nega = xor i32 %a, -1
+ %and = and i32 %b, %a
+ %xor = xor i32 %nega, %b
+ %or = or i32 %and, %xor
+ ret i32 %or
+}
+
+define i8 @lshr_perfect_mask(i8 %x) {
+; CHECK-LABEL: @lshr_perfect_mask(
+; CHECK-NEXT: [[SH:%.*]] = lshr i8 %x, 5
+; CHECK-NEXT: [[MASK:%.*]] = and i8 [[SH]], 7
+; CHECK-NEXT: ret i8 [[MASK]]
+;
+ %sh = lshr i8 %x, 5
+ %mask = and i8 %sh, 7 ; 0x07
+ ret i8 %mask
+}
+
+define <2 x i8> @lshr_oversized_mask_splat(<2 x i8> %x) {
+; CHECK-LABEL: @lshr_oversized_mask_splat(
+; CHECK-NEXT: [[SH:%.*]] = lshr <2 x i8> %x, <i8 5, i8 5>
+; CHECK-NEXT: [[MASK:%.*]] = and <2 x i8> [[SH]], <i8 -121, i8 -121>
+; CHECK-NEXT: ret <2 x i8> [[MASK]]
+;
+ %sh = lshr <2 x i8> %x, <i8 5, i8 5>
+ %mask = and <2 x i8> %sh, <i8 135, i8 135> ; 0x87
+ ret <2 x i8> %mask
+}
+
+define i8 @lshr_undersized_mask(i8 %x) {
+; CHECK-LABEL: @lshr_undersized_mask(
+; CHECK-NEXT: [[SH:%.*]] = lshr i8 %x, 5
+; CHECK-NEXT: [[MASK:%.*]] = and i8 [[SH]], -2
+; CHECK-NEXT: ret i8 [[MASK]]
+;
+ %sh = lshr i8 %x, 5
+ %mask = and i8 %sh, -2 ; 0xFE
+ ret i8 %mask
+}
+
+define <2 x i8> @shl_perfect_mask_splat(<2 x i8> %x) {
+; CHECK-LABEL: @shl_perfect_mask_splat(
+; CHECK-NEXT: [[SH:%.*]] = shl <2 x i8> %x, <i8 6, i8 6>
+; CHECK-NEXT: [[MASK:%.*]] = and <2 x i8> [[SH]], <i8 -64, i8 -64>
+; CHECK-NEXT: ret <2 x i8> [[MASK]]
+;
+ %sh = shl <2 x i8> %x, <i8 6, i8 6>
+ %mask = and <2 x i8> %sh, <i8 192, i8 192> ; 0xC0
+ ret <2 x i8> %mask
+}
+
+define i8 @shl_oversized_mask(i8 %x) {
+; CHECK-LABEL: @shl_oversized_mask(
+; CHECK-NEXT: [[SH:%.*]] = shl i8 %x, 6
+; CHECK-NEXT: [[MASK:%.*]] = and i8 [[SH]], -61
+; CHECK-NEXT: ret i8 [[MASK]]
+;
+ %sh = shl i8 %x, 6
+ %mask = and i8 %sh, 195 ; 0xC3
+ ret i8 %mask
+}
+
+define <2 x i8> @shl_undersized_mask_splat(<2 x i8> %x) {
+; CHECK-LABEL: @shl_undersized_mask_splat(
+; CHECK-NEXT: [[SH:%.*]] = shl <2 x i8> [[X:%.*]], <i8 6, i8 6>
+; CHECK-NEXT: [[MASK:%.*]] = and <2 x i8> [[SH]], <i8 -120, i8 -120>
+; CHECK-NEXT: ret <2 x i8> [[MASK]]
+;
+ %sh = shl <2 x i8> %x, <i8 6, i8 6>
+ %mask = and <2 x i8> %sh, <i8 136, i8 136> ; 0x88
+ ret <2 x i8> %mask
+}
+
diff --git a/test/Transforms/InstSimplify/apint-or.ll b/test/Transforms/InstSimplify/apint-or.ll
deleted file mode 100644
index e3dc2c48fb40..000000000000
--- a/test/Transforms/InstSimplify/apint-or.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; NOTE: Assertions have been autogenerated by update_test_checks.py
-; RUN: opt < %s -instsimplify -S | FileCheck %s
-
-; Test the case where integer BitWidth <= 64 && BitWidth % 2 != 0.
-define i39 @test1(i39 %V, i39 %M) {
-; CHECK-LABEL: @test1(
-; CHECK: [[N:%.*]] = and i39 %M, -274877906944
-; CHECK-NEXT: [[A:%.*]] = add i39 %V, [[N]]
-; CHECK-NEXT: ret i39 [[A]]
-;
- ;; If we have: ((V + N) & C1) | (V & C2)
- ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
- ;; replace with V+N.
- %C1 = xor i39 274877906943, -1 ;; C2 = 274877906943
- %N = and i39 %M, 274877906944
- %A = add i39 %V, %N
- %B = and i39 %A, %C1
- %D = and i39 %V, 274877906943
- %R = or i39 %B, %D
- ret i39 %R
-}
-
-define i7 @test2(i7 %X) {
-; CHECK-LABEL: @test2(
-; CHECK: ret i7 %X
-;
- %Y = or i7 %X, 0
- ret i7 %Y
-}
-
-define i17 @test3(i17 %X) {
-; CHECK-LABEL: @test3(
-; CHECK: ret i17 -1
-;
- %Y = or i17 %X, -1
- ret i17 %Y
-}
-
-; Test the case where Integer BitWidth > 64 && BitWidth <= 1024.
-define i399 @test4(i399 %V, i399 %M) {
-; CHECK-LABEL: @test4(
-; CHECK: [[N:%.*]] = and i399 %M, 18446742974197923840
-; CHECK-NEXT: [[A:%.*]] = add i399 %V, [[N]]
-; CHECK-NEXT: ret i399 [[A]]
-;
- ;; If we have: ((V + N) & C1) | (V & C2)
- ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
- ;; replace with V+N.
- %C1 = xor i399 274877906943, -1 ;; C2 = 274877906943
- %N = and i399 %M, 18446742974197923840
- %A = add i399 %V, %N
- %B = and i399 %A, %C1
- %D = and i399 %V, 274877906943
- %R = or i399 %B, %D
- ret i399 %R
-}
-
-define i777 @test5(i777 %X) {
-; CHECK-LABEL: @test5(
-; CHECK: ret i777 %X
-;
- %Y = or i777 %X, 0
- ret i777 %Y
-}
-
-define i117 @test6(i117 %X) {
-; CHECK-LABEL: @test6(
-; CHECK: ret i117 -1
-;
- %Y = or i117 %X, -1
- ret i117 %Y
-}
diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll
index 883bf31ff77a..d6f1b634102f 100644
--- a/test/Transforms/InstSimplify/compare.ll
+++ b/test/Transforms/InstSimplify/compare.ll
@@ -598,11 +598,14 @@ define i1 @sdiv_exact_equality(i32 %Z) {
ret i1 %C
}
-; FIXME: But not other preds: PR32949 - https://bugs.llvm.org/show_bug.cgi?id=32949
+; But not other preds: PR32949 - https://bugs.llvm.org/show_bug.cgi?id=32949
define i1 @sdiv_exact_not_equality(i32 %Z) {
; CHECK-LABEL: @sdiv_exact_not_equality(
-; CHECK-NEXT: ret i1 true
+; CHECK-NEXT: [[A:%.*]] = sdiv exact i32 10, %Z
+; CHECK-NEXT: [[B:%.*]] = sdiv exact i32 20, %Z
+; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[A]], [[B]]
+; CHECK-NEXT: ret i1 [[C]]
;
%A = sdiv exact i32 10, %Z
%B = sdiv exact i32 20, %Z
diff --git a/test/Transforms/InstSimplify/or.ll b/test/Transforms/InstSimplify/or.ll
new file mode 100644
index 000000000000..2c5b6181bc6c
--- /dev/null
+++ b/test/Transforms/InstSimplify/or.ll
@@ -0,0 +1,181 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define i32 @test1(i32 %A) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: ret i32 %A
+;
+ %B = or i32 %A, 0
+ ret i32 %B
+}
+
+define i32 @test2(i32 %A) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: ret i32 -1
+;
+ %B = or i32 %A, -1
+ ret i32 %B
+}
+
+define i8 @test2a(i8 %A) {
+; CHECK-LABEL: @test2a(
+; CHECK-NEXT: ret i8 -1
+;
+ %B = or i8 %A, -1
+ ret i8 %B
+}
+
+define i1 @test3(i1 %A) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT: ret i1 %A
+;
+ %B = or i1 %A, false
+ ret i1 %B
+}
+
+define i1 @test4(i1 %A) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT: ret i1 true
+;
+ %B = or i1 %A, true
+ ret i1 %B
+}
+
+define i1 @test5(i1 %A) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT: ret i1 %A
+;
+ %B = or i1 %A, %A
+ ret i1 %B
+}
+
+define i32 @test6(i32 %A) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT: ret i32 %A
+;
+ %B = or i32 %A, %A
+ ret i32 %B
+}
+
+; A | ~A == -1
+define i32 @test7(i32 %A) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT: ret i32 -1
+;
+ %NotA = xor i32 %A, -1
+ %B = or i32 %A, %NotA
+ ret i32 %B
+}
+
+define i8 @test8(i8 %A) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT: ret i8 -1
+;
+ %B = or i8 %A, -2
+ %C = or i8 %B, 1
+ ret i8 %C
+}
+
+; Test that (A|c1)|(B|c2) == (A|B)|(c1|c2)
+define i8 @test9(i8 %A, i8 %B) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT: ret i8 -1
+;
+ %C = or i8 %A, 1
+ %D = or i8 %B, -2
+ %E = or i8 %C, %D
+ ret i8 %E
+}
+
+define i8 @test10(i8 %A) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT: ret i8 -2
+;
+ %B = or i8 %A, 1
+ %C = and i8 %B, -2
+ ; (X & C1) | C2 --> (X | C2) & (C1|C2)
+ %D = or i8 %C, -2
+ ret i8 %D
+}
+
+define i8 @test11(i8 %A) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT: ret i8 -1
+;
+ %B = or i8 %A, -2
+ %C = xor i8 %B, 13
+ ; (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
+ %D = or i8 %C, 1
+ %E = xor i8 %D, 12
+ ret i8 %E
+}
+
+; Test the case where integer BitWidth <= 64 && BitWidth % 2 != 0.
+define i39 @test1_apint(i39 %V, i39 %M) {
+; CHECK-LABEL: @test1_apint(
+; CHECK: [[N:%.*]] = and i39 %M, -274877906944
+; CHECK-NEXT: [[A:%.*]] = add i39 %V, [[N]]
+; CHECK-NEXT: ret i39 [[A]]
+;
+ ;; If we have: ((V + N) & C1) | (V & C2)
+ ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+ ;; replace with V+N.
+ %C1 = xor i39 274877906943, -1 ;; C2 = 274877906943
+ %N = and i39 %M, 274877906944
+ %A = add i39 %V, %N
+ %B = and i39 %A, %C1
+ %D = and i39 %V, 274877906943
+ %R = or i39 %B, %D
+ ret i39 %R
+}
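+; (Sketch: C2 = 2^38 - 1 masks the low 38 bits and N can only have bit 38
+; set, so the add never carries into the masked-off low bits of V.)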
+
+define i7 @test2_apint(i7 %X) {
+; CHECK-LABEL: @test2_apint(
+; CHECK: ret i7 %X
+;
+ %Y = or i7 %X, 0
+ ret i7 %Y
+}
+
+define i17 @test3_apint(i17 %X) {
+; CHECK-LABEL: @test3_apint(
+; CHECK: ret i17 -1
+;
+ %Y = or i17 %X, -1
+ ret i17 %Y
+}
+
+; Test the case where integer BitWidth > 64 && BitWidth <= 1024.
+define i399 @test4_apint(i399 %V, i399 %M) {
+; CHECK-LABEL: @test4_apint(
+; CHECK: [[N:%.*]] = and i399 %M, 18446742974197923840
+; CHECK-NEXT: [[A:%.*]] = add i399 %V, [[N]]
+; CHECK-NEXT: ret i399 [[A]]
+;
+ ;; If we have: ((V + N) & C1) | (V & C2)
+ ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+ ;; replace with V+N.
+ %C1 = xor i399 274877906943, -1 ;; C2 = 274877906943
+ %N = and i399 %M, 18446742974197923840
+ %A = add i399 %V, %N
+ %B = and i399 %A, %C1
+ %D = and i399 %V, 274877906943
+ %R = or i399 %B, %D
+ ret i399 %R
+}
+
+define i777 @test5_apint(i777 %X) {
+; CHECK-LABEL: @test5_apint(
+; CHECK: ret i777 %X
+;
+ %Y = or i777 %X, 0
+ ret i777 %Y
+}
+
+define i117 @test6_apint(i117 %X) {
+; CHECK-LABEL: @test6_apint(
+; CHECK: ret i117 -1
+;
+ %Y = or i117 %X, -1
+ ret i117 %Y
+}
+
diff --git a/test/Transforms/LoopIdiom/ARM/ctlz.ll b/test/Transforms/LoopIdiom/ARM/ctlz.ll
new file mode 100644
index 000000000000..281d97c8c338
--- /dev/null
+++ b/test/Transforms/LoopIdiom/ARM/ctlz.ll
@@ -0,0 +1,185 @@
+; RUN: opt -loop-idiom -mtriple=armv7a < %s -S | FileCheck -check-prefix=LZCNT -check-prefix=ALL %s
+; RUN: opt -loop-idiom -mtriple=armv4t < %s -S | FileCheck -check-prefix=NOLZCNT -check-prefix=ALL %s
+
+; Recognize CTLZ builtin pattern.
+; Here we'll just convert the loop to a countable one,
+; so do not insert the builtin if the CPU does not support CTLZ.
+;
+; int ctlz_and_other(int n, char *a)
+; {
+; int i = 0, n0 = n;
+; while(n >>= 1) {
+; a[i] = (n0 & (1 << i)) ? 1 : 0;
+; i++;
+; }
+; return i;
+; }
+;
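+; For positive n with n >> 1 nonzero, the loop runs 32 - ctlz(n >> 1)
+; times, which is the trip count the LZCNT expansion below computes
+; (zext'd to i64 to match the induction variable).
+;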
+; LZCNT: entry
+; LZCNT: %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true)
+; LZCNT-NEXT: %1 = sub i32 32, %0
+; LZCNT-NEXT: %2 = zext i32 %1 to i64
+; LZCNT: %indvars.iv.next.lcssa = phi i64 [ %2, %while.body ]
+; LZCNT: %4 = trunc i64 %indvars.iv.next.lcssa to i32
+; LZCNT: %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ]
+; LZCNT: ret i32 %i.0.lcssa
+
+; NOLZCNT: entry
+; NOLZCNT-NOT: @llvm.ctlz
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) {
+entry:
+ %shr8 = ashr i32 %n, 1
+ %tobool9 = icmp eq i32 %shr8, 0
+ br i1 %tobool9, label %while.end, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ]
+ %shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ]
+ %0 = trunc i64 %indvars.iv to i32
+ %shl = shl i32 1, %0
+ %and = and i32 %shl, %n
+ %tobool1 = icmp ne i32 %and, 0
+ %conv = zext i1 %tobool1 to i8
+ %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+ store i8 %conv, i8* %arrayidx, align 1
+ %indvars.iv.next = add nuw i64 %indvars.iv, 1
+ %shr = ashr i32 %shr11, 1
+ %tobool = icmp eq i32 %shr, 0
+ br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit: ; preds = %while.body
+ %1 = trunc i64 %indvars.iv.next to i32
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ]
+ ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume the builtin is always profitable.
+;
+; int ctlz_zero_check(int n)
+; {
+; int i = 0;
+; while(n) {
+; n >>= 1;
+; i++;
+; }
+; return i;
+; }
+;
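+; For positive n, the loop is only entered when n != 0, so the trip
+; count is 32 - ctlz(n) with the zero-is-undef flag set to true, as
+; checked below.
+;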
+; ALL: entry
+; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
+; ALL-NEXT: %1 = sub i32 32, %0
+; ALL: %inc.lcssa = phi i32 [ %1, %while.body ]
+; ALL: %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+; ALL: ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_zero_check(i32 %n) {
+entry:
+ %tobool4 = icmp eq i32 %n, 0
+ br i1 %tobool4, label %while.end, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+ %n.addr.05 = phi i32 [ %shr, %while.body ], [ %n, %while.body.preheader ]
+ %shr = ashr i32 %n.addr.05, 1
+ %inc = add nsw i32 %i.06, 1
+ %tobool = icmp eq i32 %shr, 0
+ br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit: ; preds = %while.body
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]
+ ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume the builtin is always profitable.
+;
+; int ctlz(int n)
+; {
+; int i = 0;
+; while(n >>= 1) {
+; i++;
+; }
+; return i;
+; }
+;
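+; With no zero guard the first shifted value can already be zero, so
+; the flag is i1 false (ctlz(0) is then well defined as 32) and the
+; count is computed from n >> 1, since the shift happens before the
+; exit test.
+;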
+; ALL: entry
+; ALL: %0 = ashr i32 %n, 1
+; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT: %2 = sub i32 32, %1
+; ALL-NEXT: %3 = add i32 %2, 1
+; ALL: %i.0.lcssa = phi i32 [ %2, %while.cond ]
+; ALL: ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz(i32 %n) {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.cond, %entry
+ %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
+ %shr = ashr i32 %n.addr.0, 1
+ %tobool = icmp eq i32 %shr, 0
+ %inc = add nsw i32 %i.0, 1
+ br i1 %tobool, label %while.end, label %while.cond
+
+while.end: ; preds = %while.cond
+ ret i32 %i.0
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume the builtin is always profitable.
+;
+; int ctlz_add(int n, int i0)
+; {
+; int i = i0;
+; while(n >>= 1) {
+; i++;
+; }
+; return i;
+; }
+;
+; ALL: entry
+; ALL: %0 = ashr i32 %n, 1
+; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT: %2 = sub i32 32, %1
+; ALL-NEXT: %3 = add i32 %2, 1
+; ALL-NEXT: %4 = add i32 %2, %i0
+; ALL: %i.0.lcssa = phi i32 [ %4, %while.cond ]
+; ALL: ret i32 %i.0.lcssa
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_add(i32 %n, i32 %i0) {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.cond, %entry
+ %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
+ %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]
+ %shr = ashr i32 %n.addr.0, 1
+ %tobool = icmp eq i32 %shr, 0
+ %inc = add nsw i32 %i.0, 1
+ br i1 %tobool, label %while.end, label %while.cond
+
+while.end: ; preds = %while.cond
+ ret i32 %i.0
+}
diff --git a/test/Transforms/LoopIdiom/X86/ctlz.ll b/test/Transforms/LoopIdiom/X86/ctlz.ll
new file mode 100644
index 000000000000..d8daa3a9bbab
--- /dev/null
+++ b/test/Transforms/LoopIdiom/X86/ctlz.ll
@@ -0,0 +1,185 @@
+; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck -check-prefix=LZCNT --check-prefix=ALL %s
+; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck -check-prefix=NOLZCNT --check-prefix=ALL %s
+
+; Recognize CTLZ builtin pattern.
+; Here we'll just convert the loop to a countable one,
+; so do not insert the builtin if the CPU does not support CTLZ.
+;
+; int ctlz_and_other(int n, char *a)
+; {
+; int i = 0, n0 = n;
+; while(n >>= 1) {
+; a[i] = (n0 & (1 << i)) ? 1 : 0;
+; i++;
+; }
+; return i;
+; }
+;
+; LZCNT: entry
+; LZCNT: %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true)
+; LZCNT-NEXT: %1 = sub i32 32, %0
+; LZCNT-NEXT: %2 = zext i32 %1 to i64
+; LZCNT: %indvars.iv.next.lcssa = phi i64 [ %2, %while.body ]
+; LZCNT: %4 = trunc i64 %indvars.iv.next.lcssa to i32
+; LZCNT: %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ]
+; LZCNT: ret i32 %i.0.lcssa
+
+; NOLZCNT: entry
+; NOLZCNT-NOT: @llvm.ctlz
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) {
+entry:
+ %shr8 = ashr i32 %n, 1
+ %tobool9 = icmp eq i32 %shr8, 0
+ br i1 %tobool9, label %while.end, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ]
+ %shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ]
+ %0 = trunc i64 %indvars.iv to i32
+ %shl = shl i32 1, %0
+ %and = and i32 %shl, %n
+ %tobool1 = icmp ne i32 %and, 0
+ %conv = zext i1 %tobool1 to i8
+ %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+ store i8 %conv, i8* %arrayidx, align 1
+ %indvars.iv.next = add nuw i64 %indvars.iv, 1
+ %shr = ashr i32 %shr11, 1
+ %tobool = icmp eq i32 %shr, 0
+ br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit: ; preds = %while.body
+ %1 = trunc i64 %indvars.iv.next to i32
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ]
+ ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume the builtin is always profitable.
+;
+; int ctlz_zero_check(int n)
+; {
+; int i = 0;
+; while(n) {
+; n >>= 1;
+; i++;
+; }
+; return i;
+; }
+;
+; ALL: entry
+; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
+; ALL-NEXT: %1 = sub i32 32, %0
+; ALL: %inc.lcssa = phi i32 [ %1, %while.body ]
+; ALL: %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+; ALL: ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_zero_check(i32 %n) {
+entry:
+ %tobool4 = icmp eq i32 %n, 0
+ br i1 %tobool4, label %while.end, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+ %n.addr.05 = phi i32 [ %shr, %while.body ], [ %n, %while.body.preheader ]
+ %shr = ashr i32 %n.addr.05, 1
+ %inc = add nsw i32 %i.06, 1
+ %tobool = icmp eq i32 %shr, 0
+ br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit: ; preds = %while.body
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]
+ ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume the builtin is always profitable.
+;
+; int ctlz(int n)
+; {
+; int i = 0;
+; while(n >>= 1) {
+; i++;
+; }
+; return i;
+; }
+;
+; ALL: entry
+; ALL: %0 = ashr i32 %n, 1
+; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT: %2 = sub i32 32, %1
+; ALL-NEXT: %3 = add i32 %2, 1
+; ALL: %i.0.lcssa = phi i32 [ %2, %while.cond ]
+; ALL: ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz(i32 %n) {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.cond, %entry
+ %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
+ %shr = ashr i32 %n.addr.0, 1
+ %tobool = icmp eq i32 %shr, 0
+ %inc = add nsw i32 %i.0, 1
+ br i1 %tobool, label %while.end, label %while.cond
+
+while.end: ; preds = %while.cond
+ ret i32 %i.0
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume the builtin is always profitable.
+;
+; int ctlz_add(int n, int i0)
+; {
+; int i = i0;
+; while(n >>= 1) {
+; i++;
+; }
+; return i;
+; }
+;
+; ALL: entry
+; ALL: %0 = ashr i32 %n, 1
+; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT: %2 = sub i32 32, %1
+; ALL-NEXT: %3 = add i32 %2, 1
+; ALL-NEXT: %4 = add i32 %2, %i0
+; ALL: %i.0.lcssa = phi i32 [ %4, %while.cond ]
+; ALL: ret i32 %i.0.lcssa
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_add(i32 %n, i32 %i0) {
+entry:
+ br label %while.cond
+
+while.cond: ; preds = %while.cond, %entry
+ %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]
+ %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]
+ %shr = ashr i32 %n.addr.0, 1
+ %tobool = icmp eq i32 %shr, 0
+ %inc = add nsw i32 %i.0, 1
+ br i1 %tobool, label %while.end, label %while.cond
+
+while.end: ; preds = %while.cond
+ ret i32 %i.0
+}
diff --git a/test/Transforms/LoopUnroll/not-rotated.ll b/test/Transforms/LoopUnroll/not-rotated.ll
index ffe80920d948..b4b88e096079 100644
--- a/test/Transforms/LoopUnroll/not-rotated.ll
+++ b/test/Transforms/LoopUnroll/not-rotated.ll
@@ -4,7 +4,7 @@
; properly handled by LoopUnroll, currently.
; RUN: opt -loop-unroll -verify-dom-info %s
-; REQUIRE: asserts
+; REQUIRES: asserts
define void @tinkywinky(i1 %patatino) {
entry:
diff --git a/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
new file mode 100644
index 000000000000..5a4bfe5e6bdd
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
@@ -0,0 +1,187 @@
+; RUN: opt -vector-library=SVML -loop-vectorize -S < %s | FileCheck %s
+
+; Verify that when math headers are built with __FINITE_MATH_ONLY__
+; enabled, causing calls to the __<func>_finite function versions,
+; vectorization can map those calls to their vector counterparts.
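+;
+; Roughly the C source behind a test like @exp_f32 below (a sketch,
+; assuming glibc-style headers that map expf() to __expf_finite()):
+;
+;   #include <math.h>
+;   void exp_f32(float *varray) {
+;     for (int i = 0; i < 1000; i++)
+;       varray[i] = expf((float)i);
+;   }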
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare float @__expf_finite(float) #0
+
+; CHECK-LABEL: @exp_f32
+; CHECK: <4 x float> @__svml_expf4
+; CHECK: ret
+define void @exp_f32(float* nocapture %varray) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__expf_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv
+ store float %call, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+declare double @__exp_finite(double) #0
+
+; CHECK-LABEL: @exp_f64
+; CHECK: <4 x double> @__svml_exp4
+; CHECK: ret
+define void @exp_f64(double* nocapture %varray) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call fast double @__exp_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv
+ store double %call, double* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !11
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!11 = distinct !{!11, !12, !13}
+!12 = !{!"llvm.loop.vectorize.width", i32 4}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+
+
+declare float @__logf_finite(float) #0
+
+; CHECK-LABEL: @log_f32
+; CHECK: <4 x float> @__svml_logf4
+; CHECK: ret
+define void @log_f32(float* nocapture %varray) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__logf_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv
+ store float %call, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!21 = distinct !{!21, !22, !23}
+!22 = !{!"llvm.loop.vectorize.width", i32 4}
+!23 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+declare double @__log_finite(double) #0
+
+; CHECK-LABEL: @log_f64
+; CHECK: <4 x double> @__svml_log4
+; CHECK: ret
+define void @log_f64(double* nocapture %varray) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call fast double @__log_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv
+ store double %call, double* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!31 = distinct !{!31, !32, !33}
+!32 = !{!"llvm.loop.vectorize.width", i32 4}
+!33 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+declare float @__powf_finite(float, float) #0
+
+; CHECK-LABEL: @pow_f32
+; CHECK: <4 x float> @__svml_powf4
+; CHECK: ret
+define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %indvars.iv
+ %tmp1 = load float, float* %arrayidx, align 4
+ %tmp2 = tail call fast float @__powf_finite(float %conv, float %tmp1)
+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %indvars.iv
+ store float %tmp2, float* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !41
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!41 = distinct !{!41, !42, !43}
+!42 = !{!"llvm.loop.vectorize.width", i32 4}
+!43 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+declare double @__pow_finite(double, double) #0
+
+; CHECK-LABEL: @pow_f64
+; CHECK: <4 x double> @__svml_pow4
+; CHECK: ret
+define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to double
+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %indvars.iv
+ %tmp1 = load double, double* %arrayidx, align 4
+ %tmp2 = tail call fast double @__pow_finite(double %conv, double %tmp1)
+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %indvars.iv
+ store double %tmp2, double* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !51
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!51 = distinct !{!51, !52, !53}
+!52 = !{!"llvm.loop.vectorize.width", i32 4}
+!53 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll
index 6507166dd1f2..7e9e6b1cdc8e 100644
--- a/test/Transforms/LoopVectorize/induction.ll
+++ b/test/Transforms/LoopVectorize/induction.ll
@@ -849,3 +849,48 @@ for.end:
%tmp7 = phi i32 [ %tmp6, %for.inc ]
ret i32 %tmp7
}
+
+; Ensure that the shufflevector for a first-order recurrence is inserted
+; correctly after all the phis. These new phis correspond to new IVs
+; that are generated by optimizing non-free truncs of IVs into IVs themselves.
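+; Here the recurrence is %y, which carries the previous iteration's %c6;
+; in the vector body it becomes %vector.recur, and the shufflevector
+; splices the last element of the previous vector iteration with the
+; current %vec.ind5, as the CHECK lines verify.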
+define i64 @trunc_with_first_order_recurrence() {
+; CHECK-LABEL: trunc_with_first_order_recurrence
+; CHECK-LABEL: vector.body:
+; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT: %vec.phi = phi <2 x i64>
+; CHECK-NEXT: %vec.ind = phi <2 x i64> [ <i64 1, i64 2>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK-NEXT: %vec.ind2 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next3, %vector.body ]
+; CHECK-NEXT: %vector.recur = phi <2 x i32> [ <i32 undef, i32 42>, %vector.ph ], [ %vec.ind5, %vector.body ]
+; CHECK-NEXT: %vec.ind5 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next6, %vector.body ]
+; CHECK-NEXT: %vec.ind7 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next8, %vector.body ]
+; CHECK-NEXT: shufflevector <2 x i32> %vector.recur, <2 x i32> %vec.ind5, <2 x i32> <i32 1, i32 2>
+entry:
+ br label %loop
+
+exit: ; preds = %loop
+ %.lcssa = phi i64 [ %c23, %loop ]
+ ret i64 %.lcssa
+
+loop: ; preds = %loop, %entry
+ %c5 = phi i64 [ %c23, %loop ], [ 0, %entry ]
+ %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 1, %entry ]
+ %x = phi i32 [ %c24, %loop ], [ 1, %entry ]
+ %y = phi i32 [ %c6, %loop ], [ 42, %entry ]
+ %c6 = trunc i64 %indvars.iv to i32
+ %c8 = mul i32 %x, %c6
+ %c9 = add i32 %c8, 42
+ %c10 = add i32 %y, %c6
+ %c11 = add i32 %c10, %c9
+ %c12 = sext i32 %c11 to i64
+ %c13 = add i64 %c5, %c12
+ %indvars.iv.tr = trunc i64 %indvars.iv to i32
+ %c14 = shl i32 %indvars.iv.tr, 1
+ %c15 = add i32 %c9, %c14
+ %c16 = sext i32 %c15 to i64
+ %c23 = add i64 %c13, %c16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %c24 = add nuw nsw i32 %x, 1
+ %exitcond.i = icmp eq i64 %indvars.iv.next, 114
+ br i1 %exitcond.i, label %exit, label %loop
+
+}
diff --git a/test/Transforms/LoopVectorize/pr32859.ll b/test/Transforms/LoopVectorize/pr32859.ll
new file mode 100644
index 000000000000..31cb84699f71
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr32859.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; Outside of LCSSA form we could have 'phi i32 [ loop-invariant, %for.inc.2.i ]',
+; but the IR Verifier requires a PHI to have one entry for each predecessor
+; of its parent basic block. The original PR14725 fix simply added 'undef'
+; for the extra predecessor BB, which is not correct. Instead we copy the
+; real value from the other predecessor rather than introducing 'undef',
+; as the CHECK line below shows for %middle.block.
+
+; CHECK-LABEL: for.cond.preheader:
+; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ]
+
+; Function Attrs: nounwind uwtable
+define void @main() #0 {
+entry:
+ br label %for.cond1.preheader.i
+
+for.cond1.preheader.i: ; preds = %if.end.2.i, %entry
+ %c.06.i = phi i32 [ 0, %entry ], [ %inc5.i, %if.end.2.i ]
+ %tobool.i = icmp ne i32 undef, 0
+ br label %if.end.2.i
+
+if.end.2.i: ; preds = %for.cond1.preheader.i
+ %inc5.i = add nsw i32 %c.06.i, 1
+ %cmp.i = icmp slt i32 %inc5.i, 16
+ br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.preheader
+
+for.cond.preheader: ; preds = %if.end.2.i
+ %e.0.ph = phi i32 [ 0, %if.end.2.i ]
+ unreachable
+}
diff --git a/test/Transforms/NewGVN/pr32934.ll b/test/Transforms/NewGVN/pr32934.ll
new file mode 100644
index 000000000000..4bb7ea150437
--- /dev/null
+++ b/test/Transforms/NewGVN/pr32934.ll
@@ -0,0 +1,69 @@
+; REQUIRES: disabled
+; RUN: opt -S -newgvn %s | FileCheck %s
+
+; CHECK: define void @tinkywinky() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %d = alloca i32, align 4
+; CHECK-NEXT: store i32 0, i32* null, align 4
+; CHECK-NEXT: br label %for.cond
+; CHECK: for.cond: ; preds = %if.end, %entry
+; CHECK-NEXT: %0 = load i32, i32* null, align 4
+; CHECK-NEXT: %cmp = icmp slt i32 %0, 1
+; CHECK-NEXT: br i1 %cmp, label %for.body, label %while.cond
+; CHECK: for.body: ; preds = %for.cond
+; CHECK-NEXT: %1 = load i32, i32* @a, align 4
+; CHECK-NEXT: store i32 %1, i32* %d, align 4
+; CHECK-NEXT: br label %L
+; CHECK: L: ; preds = %if.then, %for.body
+; CHECK-NEXT: %tobool = icmp ne i32 %1, 0
+; CHECK-NEXT: br i1 %tobool, label %if.then, label %if.end
+; CHECK: if.then: ; preds = %L
+; CHECK-NEXT: call void (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @patatino, i32 0, i32 0))
+; CHECK-NEXT: br label %L
+; CHECK: if.end: ; preds = %L
+; CHECK-NEXT: br label %for.cond
+; CHECK: while.cond: ; preds = %while.body, %for.cond
+; CHECK-NEXT: br i1 undef, label %while.body, label %while.end
+; CHECK: while.body: ; preds = %while.cond
+; CHECK-NEXT: call void (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @patatino, i32 0, i32 0))
+; CHECK-NEXT: br label %while.cond
+; CHECK: while.end:
+; CHECK-NEXT: %2 = load i32, i32* @a, align 4
+; CHECK-NEXT: store i32 %2, i32* undef, align 4
+; CHECK-NEXT: ret void
+
+@a = external global i32, align 4
+@patatino = external unnamed_addr constant [2 x i8], align 1
+define void @tinkywinky() {
+entry:
+ %d = alloca i32, align 4
+ store i32 0, i32* null, align 4
+ br label %for.cond
+for.cond:
+ %0 = load i32, i32* null, align 4
+ %cmp = icmp slt i32 %0, 1
+ br i1 %cmp, label %for.body, label %while.cond
+for.body:
+ %1 = load i32, i32* @a, align 4
+ store i32 %1, i32* %d, align 4
+ br label %L
+L:
+ %2 = load i32, i32* %d, align 4
+ %tobool = icmp ne i32 %2, 0
+ br i1 %tobool, label %if.then, label %if.end
+if.then:
+ call void (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @patatino, i32 0, i32 0))
+ br label %L
+if.end:
+ br label %for.cond
+while.cond:
+ br i1 undef, label %while.body, label %while.end
+while.body:
+ call void (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @patatino, i32 0, i32 0))
+ br label %while.cond
+while.end:
+ %3 = load i32, i32* @a, align 4
+ store i32 %3, i32* undef, align 4
+ ret void
+}
+declare void @printf(i8*, ...) #1
diff --git a/test/Transforms/NewGVN/pr32952.ll b/test/Transforms/NewGVN/pr32952.ll
new file mode 100644
index 000000000000..056b3a5105ec
--- /dev/null
+++ b/test/Transforms/NewGVN/pr32952.ll
@@ -0,0 +1,42 @@
+; PR32952: Don't erroneously consider two phi nodes congruent when they
+; have the same incoming values but receive them along different edges.
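+; For example, on the edge from %tinky, %meh receives %conv1 while
+; %banana receives %0, so merging the two phis would swap the values.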
+; RUN: opt -newgvn -S %s | FileCheck %s
+
+@a = common global i16 0, align 2
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+define i32 @tinkywinky() {
+entry:
+ %0 = load i16, i16* @a, align 2
+ %conv = sext i16 %0 to i32
+ %neg = xor i32 %conv, -1
+ %conv1 = trunc i32 %neg to i16
+ %conv3 = zext i16 %conv1 to i32
+ %cmp = icmp slt i32 %conv, %conv3
+ br i1 %cmp, label %tinky, label %winky
+
+tinky:
+ store i16 2, i16* @a, align 2
+ br label %patatino
+
+winky:
+ br label %patatino
+
+patatino:
+; CHECK: %meh = phi i16 [ %0, %winky ], [ %conv1, %tinky ]
+; CHECK: %banana = phi i16 [ %0, %tinky ], [ %conv1, %winky ]
+ %meh = phi i16 [ %0, %winky ], [ %conv1, %tinky ]
+ %banana = phi i16 [ %0, %tinky ], [ %conv1, %winky ]
+ br label %end
+
+end:
+; CHECK: %promoted = zext i16 %banana to i32
+; CHECK: %other = zext i16 %meh to i32
+ %promoted = zext i16 %banana to i32
+ %other = zext i16 %meh to i32
+ %first = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %promoted)
+ %second = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %other)
+ ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/Transforms/NewGVN/verify-memoryphi.ll b/test/Transforms/NewGVN/verify-memoryphi.ll
new file mode 100644
index 000000000000..57dbd18986d2
--- /dev/null
+++ b/test/Transforms/NewGVN/verify-memoryphi.ll
@@ -0,0 +1,29 @@
+; Skip dead MemoryPhis when performing memory congruency verification
+; in NewGVN.
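+; The branch on false leaves %body effectively dead; once the
+; lifetime.start calls are eliminated (see the CHECK lines below), the
+; MemoryPhi that merged their memory states is dead as well, and the
+; verification must skip it instead of asserting.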
+; RUN: opt -S -newgvn %s | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: define void @tinkywinky() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label %body, label %end
+; CHECK: body:
+; CHECK-NEXT: store i8 undef, i8* null
+; CHECK-NEXT: br label %end
+; CHECK: end:
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+
+define void @tinkywinky() {
+entry:
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* undef)
+ br i1 false, label %body, label %end
+
+body:
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* undef)
+ br label %end
+
+end:
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll b/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
new file mode 100644
index 000000000000..edc8042a217d
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
@@ -0,0 +1,22 @@
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone < %s | FileCheck %s
+; Currently disabled for a few subtargets (e.g. Kryo):
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck --check-prefix=NO_SLP %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -slp-min-reg-size=128 < %s | FileCheck --check-prefix=NO_SLP %s
+
+define void @f(float* %r, float* %w) {
+ %r0 = getelementptr inbounds float, float* %r, i64 0
+ %r1 = getelementptr inbounds float, float* %r, i64 1
+ %f0 = load float, float* %r0
+ %f1 = load float, float* %r1
+ %add0 = fadd float %f0, %f0
+; CHECK: fadd <2 x float>
+; NO_SLP: fadd float
+; NO_SLP: fadd float
+ %add1 = fadd float %f1, %f1
+ %w0 = getelementptr inbounds float, float* %w, i64 0
+ %w1 = getelementptr inbounds float, float* %w, i64 1
+ store float %add0, float* %w0
+ store float %add1, float* %w1
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index e9b71963530c..962a6c3b57b3 100644
--- a/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -1,4 +1,5 @@
-; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=YAML %s
target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
@@ -23,7 +24,25 @@ target triple = "aarch64--linux-gnu"
; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32>
; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]]
; CHECK: sext i32 [[X]] to i64
-;
+
+; YAML: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedList
+; YAML-NEXT: Function: getelementptr_4x32
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'SLP vectorized with cost '
+; YAML-NEXT: - Cost: '11'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '5'
+
+; YAML: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedList
+; YAML-NEXT: Function: getelementptr_4x32
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'SLP vectorized with cost '
+; YAML-NEXT: - Cost: '16'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '3'
+
define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
entry:
%cmp31 = icmp sgt i32 %n, 0
@@ -69,7 +88,25 @@ for.body:
; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32>
; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]]
; CHECK: sext i32 [[X]] to i64
-;
+
+; YAML: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedList
+; YAML-NEXT: Function: getelementptr_2x32
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'SLP vectorized with cost '
+; YAML-NEXT: - Cost: '11'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '5'
+
+; YAML: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedList
+; YAML-NEXT: Function: getelementptr_2x32
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'SLP vectorized with cost '
+; YAML-NEXT: - Cost: '6'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '3'
+
define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
entry:
%cmp31 = icmp sgt i32 %n, 0
diff --git a/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 8f8bf2648aa2..1a6a2fb890d3 100644
--- a/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -1,4 +1,5 @@
-; RUN: opt -slp-vectorizer -slp-threshold=-6 -S < %s | FileCheck %s
+; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=YAML %s
; FIXME: The threshold is changed to keep this test case a bit smaller.
; The AArch64 cost model should not give such high costs to select statements.
@@ -10,6 +11,16 @@ target triple = "aarch64--linux"
; CHECK: load <4 x i32>
; CHECK: load <4 x i32>
; CHECK: select <4 x i1>
+
+; YAML: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedHorizontalReduction
+; YAML-NEXT: Function: test_select
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
+; YAML-NEXT: - Cost: '4'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '8'
+
define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) {
entry:
%cmp.22 = icmp sgt i32 %h, 0
@@ -93,6 +104,16 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia
; CHECK: load <4 x i32>
; CHECK: load <4 x i32>
; CHECK: mul nsw <4 x i32>
+
+; YAML: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedHorizontalReduction
+; YAML-NEXT: Function: reduction_with_br
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
+; YAML-NEXT: - Cost: '1'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '3'
+
entry:
%cmp.16 = icmp sgt i32 %h, 0
br i1 %cmp.16, label %for.body.lr.ph, label %for.end
@@ -150,6 +171,16 @@ for.end: ; preds = %for.end.loopexit, %
; CHECK: load <8 x i8>
; CHECK: load <8 x i8>
; CHECK: select <8 x i1>
+
+; YAML: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedHorizontalReduction
+; YAML-NEXT: Function: test_unrolled_select
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
+; YAML-NEXT: - Cost: '-33'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '10'
+
define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 {
entry:
%cmp.43 = icmp sgt i32 %h, 0
diff --git a/test/Transforms/SLPVectorizer/AArch64/remarks.ll b/test/Transforms/SLPVectorizer/AArch64/remarks.ll
new file mode 100644
index 000000000000..e8c37512594e
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/AArch64/remarks.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -pass-remarks=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s
+
+define void @f(double* %r, double* %w) {
+ %r0 = getelementptr inbounds double, double* %r, i64 0
+ %r1 = getelementptr inbounds double, double* %r, i64 1
+ %f0 = load double, double* %r0
+ %f1 = load double, double* %r1
+ %add0 = fadd double %f0, %f0
+ %add1 = fadd double %f1, %f1
+ %w0 = getelementptr inbounds double, double* %w, i64 0
+ %w1 = getelementptr inbounds double, double* %w, i64 1
+; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3
+ store double %add0, double* %w0, !dbg !9
+ store double %add1, double* %w1
+ ret void
+}
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)"}
+!7 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, isOptimized: true, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 5, column: 10, scope: !7)
diff --git a/test/Transforms/SLPVectorizer/X86/arith-add.ll b/test/Transforms/SLPVectorizer/X86/arith-add.ll
new file mode 100644
index 000000000000..0266758b27d2
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/arith-add.ll
@@ -0,0 +1,649 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+define void @add_v8i64() {
+; SSE-LABEL: @add_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @add_v8i64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = add i64 %a0, %b0
+ %r1 = add i64 %a1, %b1
+ %r2 = add i64 %a2, %b2
+ %r3 = add i64 %a3, %b3
+ %r4 = add i64 %a4, %b4
+ %r5 = add i64 %a5, %b5
+ %r6 = add i64 %a6, %b6
+ %r7 = add i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @add_v16i32() {
+; SSE-LABEL: @add_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @add_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = add i32 %a0 , %b0
+ %r1 = add i32 %a1 , %b1
+ %r2 = add i32 %a2 , %b2
+ %r3 = add i32 %a3 , %b3
+ %r4 = add i32 %a4 , %b4
+ %r5 = add i32 %a5 , %b5
+ %r6 = add i32 %a6 , %b6
+ %r7 = add i32 %a7 , %b7
+ %r8 = add i32 %a8 , %b8
+ %r9 = add i32 %a9 , %b9
+ %r10 = add i32 %a10, %b10
+ %r11 = add i32 %a11, %b11
+ %r12 = add i32 %a12, %b12
+ %r13 = add i32 %a13, %b13
+ %r14 = add i32 %a14, %b14
+ %r15 = add i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @add_v32i16() {
+; SSE-LABEL: @add_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @add_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = add i16 %a0 , %b0
+ %r1 = add i16 %a1 , %b1
+ %r2 = add i16 %a2 , %b2
+ %r3 = add i16 %a3 , %b3
+ %r4 = add i16 %a4 , %b4
+ %r5 = add i16 %a5 , %b5
+ %r6 = add i16 %a6 , %b6
+ %r7 = add i16 %a7 , %b7
+ %r8 = add i16 %a8 , %b8
+ %r9 = add i16 %a9 , %b9
+ %r10 = add i16 %a10, %b10
+ %r11 = add i16 %a11, %b11
+ %r12 = add i16 %a12, %b12
+ %r13 = add i16 %a13, %b13
+ %r14 = add i16 %a14, %b14
+ %r15 = add i16 %a15, %b15
+ %r16 = add i16 %a16, %b16
+ %r17 = add i16 %a17, %b17
+ %r18 = add i16 %a18, %b18
+ %r19 = add i16 %a19, %b19
+ %r20 = add i16 %a20, %b20
+ %r21 = add i16 %a21, %b21
+ %r22 = add i16 %a22, %b22
+ %r23 = add i16 %a23, %b23
+ %r24 = add i16 %a24, %b24
+ %r25 = add i16 %a25, %b25
+ %r26 = add i16 %a26, %b26
+ %r27 = add i16 %a27, %b27
+ %r28 = add i16 %a28, %b28
+ %r29 = add i16 %a29, %b29
+ %r30 = add i16 %a30, %b30
+ %r31 = add i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
+define void @add_v64i8() {
+; CHECK-LABEL: @add_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = add <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = add i8 %a0 , %b0
+ %r1 = add i8 %a1 , %b1
+ %r2 = add i8 %a2 , %b2
+ %r3 = add i8 %a3 , %b3
+ %r4 = add i8 %a4 , %b4
+ %r5 = add i8 %a5 , %b5
+ %r6 = add i8 %a6 , %b6
+ %r7 = add i8 %a7 , %b7
+ %r8 = add i8 %a8 , %b8
+ %r9 = add i8 %a9 , %b9
+ %r10 = add i8 %a10, %b10
+ %r11 = add i8 %a11, %b11
+ %r12 = add i8 %a12, %b12
+ %r13 = add i8 %a13, %b13
+ %r14 = add i8 %a14, %b14
+ %r15 = add i8 %a15, %b15
+ %r16 = add i8 %a16, %b16
+ %r17 = add i8 %a17, %b17
+ %r18 = add i8 %a18, %b18
+ %r19 = add i8 %a19, %b19
+ %r20 = add i8 %a20, %b20
+ %r21 = add i8 %a21, %b21
+ %r22 = add i8 %a22, %b22
+ %r23 = add i8 %a23, %b23
+ %r24 = add i8 %a24, %b24
+ %r25 = add i8 %a25, %b25
+ %r26 = add i8 %a26, %b26
+ %r27 = add i8 %a27, %b27
+ %r28 = add i8 %a28, %b28
+ %r29 = add i8 %a29, %b29
+ %r30 = add i8 %a30, %b30
+ %r31 = add i8 %a31, %b31
+ %r32 = add i8 %a32, %b32
+ %r33 = add i8 %a33, %b33
+ %r34 = add i8 %a34, %b34
+ %r35 = add i8 %a35, %b35
+ %r36 = add i8 %a36, %b36
+ %r37 = add i8 %a37, %b37
+ %r38 = add i8 %a38, %b38
+ %r39 = add i8 %a39, %b39
+ %r40 = add i8 %a40, %b40
+ %r41 = add i8 %a41, %b41
+ %r42 = add i8 %a42, %b42
+ %r43 = add i8 %a43, %b43
+ %r44 = add i8 %a44, %b44
+ %r45 = add i8 %a45, %b45
+ %r46 = add i8 %a46, %b46
+ %r47 = add i8 %a47, %b47
+ %r48 = add i8 %a48, %b48
+ %r49 = add i8 %a49, %b49
+ %r50 = add i8 %a50, %b50
+ %r51 = add i8 %a51, %b51
+ %r52 = add i8 %a52, %b52
+ %r53 = add i8 %a53, %b53
+ %r54 = add i8 %a54, %b54
+ %r55 = add i8 %a55, %b55
+ %r56 = add i8 %a56, %b56
+ %r57 = add i8 %a57, %b57
+ %r58 = add i8 %a58, %b58
+ %r59 = add i8 %a59, %b59
+ %r60 = add i8 %a60, %b60
+ %r61 = add i8 %a61, %b61
+ %r62 = add i8 %a62, %b62
+ %r63 = add i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/arith-mul.ll b/test/Transforms/SLPVectorizer/X86/arith-mul.ll
new file mode 100644
index 000000000000..95875d7f01fd
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/arith-mul.ll
@@ -0,0 +1,700 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+define void @mul_v8i64() {
+; SSE-LABEL: @mul_v8i64(
+; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SSE-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]]
+; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @mul_v8i64(
+; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]]
+; AVX1-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]]
+; AVX1-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]]
+; AVX1-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]]
+; AVX1-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]]
+; AVX1-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]]
+; AVX1-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]]
+; AVX1-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]]
+; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @mul_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @mul_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = mul <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = mul i64 %a0, %b0
+ %r1 = mul i64 %a1, %b1
+ %r2 = mul i64 %a2, %b2
+ %r3 = mul i64 %a3, %b3
+ %r4 = mul i64 %a4, %b4
+ %r5 = mul i64 %a5, %b5
+ %r6 = mul i64 %a6, %b6
+ %r7 = mul i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @mul_v16i32() {
+; SSE-LABEL: @mul_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @mul_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @mul_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = mul <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = mul i32 %a0 , %b0
+ %r1 = mul i32 %a1 , %b1
+ %r2 = mul i32 %a2 , %b2
+ %r3 = mul i32 %a3 , %b3
+ %r4 = mul i32 %a4 , %b4
+ %r5 = mul i32 %a5 , %b5
+ %r6 = mul i32 %a6 , %b6
+ %r7 = mul i32 %a7 , %b7
+ %r8 = mul i32 %a8 , %b8
+ %r9 = mul i32 %a9 , %b9
+ %r10 = mul i32 %a10, %b10
+ %r11 = mul i32 %a11, %b11
+ %r12 = mul i32 %a12, %b12
+ %r13 = mul i32 %a13, %b13
+ %r14 = mul i32 %a14, %b14
+ %r15 = mul i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @mul_v32i16() {
+; SSE-LABEL: @mul_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @mul_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @mul_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = mul i16 %a0 , %b0
+ %r1 = mul i16 %a1 , %b1
+ %r2 = mul i16 %a2 , %b2
+ %r3 = mul i16 %a3 , %b3
+ %r4 = mul i16 %a4 , %b4
+ %r5 = mul i16 %a5 , %b5
+ %r6 = mul i16 %a6 , %b6
+ %r7 = mul i16 %a7 , %b7
+ %r8 = mul i16 %a8 , %b8
+ %r9 = mul i16 %a9 , %b9
+ %r10 = mul i16 %a10, %b10
+ %r11 = mul i16 %a11, %b11
+ %r12 = mul i16 %a12, %b12
+ %r13 = mul i16 %a13, %b13
+ %r14 = mul i16 %a14, %b14
+ %r15 = mul i16 %a15, %b15
+ %r16 = mul i16 %a16, %b16
+ %r17 = mul i16 %a17, %b17
+ %r18 = mul i16 %a18, %b18
+ %r19 = mul i16 %a19, %b19
+ %r20 = mul i16 %a20, %b20
+ %r21 = mul i16 %a21, %b21
+ %r22 = mul i16 %a22, %b22
+ %r23 = mul i16 %a23, %b23
+ %r24 = mul i16 %a24, %b24
+ %r25 = mul i16 %a25, %b25
+ %r26 = mul i16 %a26, %b26
+ %r27 = mul i16 %a27, %b27
+ %r28 = mul i16 %a28, %b28
+ %r29 = mul i16 %a29, %b29
+ %r30 = mul i16 %a30, %b30
+ %r31 = mul i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
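+; Note: both AVX512 RUN lines (knl/AVX512F and skx/AVX512BW) share the AVX512
+; checks above because they currently emit the same 256-bit <16 x i16>
+; sequence for this function. Without AVX512BW a 512-bit i16 multiply is not
+; legal, and the skx output happens to match; this is an observation from the
+; generated checks, not a cost-model guarantee.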
+
+define void @mul_v64i8() {
+; CHECK-LABEL: @mul_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = mul i8 %a0 , %b0
+ %r1 = mul i8 %a1 , %b1
+ %r2 = mul i8 %a2 , %b2
+ %r3 = mul i8 %a3 , %b3
+ %r4 = mul i8 %a4 , %b4
+ %r5 = mul i8 %a5 , %b5
+ %r6 = mul i8 %a6 , %b6
+ %r7 = mul i8 %a7 , %b7
+ %r8 = mul i8 %a8 , %b8
+ %r9 = mul i8 %a9 , %b9
+ %r10 = mul i8 %a10, %b10
+ %r11 = mul i8 %a11, %b11
+ %r12 = mul i8 %a12, %b12
+ %r13 = mul i8 %a13, %b13
+ %r14 = mul i8 %a14, %b14
+ %r15 = mul i8 %a15, %b15
+ %r16 = mul i8 %a16, %b16
+ %r17 = mul i8 %a17, %b17
+ %r18 = mul i8 %a18, %b18
+ %r19 = mul i8 %a19, %b19
+ %r20 = mul i8 %a20, %b20
+ %r21 = mul i8 %a21, %b21
+ %r22 = mul i8 %a22, %b22
+ %r23 = mul i8 %a23, %b23
+ %r24 = mul i8 %a24, %b24
+ %r25 = mul i8 %a25, %b25
+ %r26 = mul i8 %a26, %b26
+ %r27 = mul i8 %a27, %b27
+ %r28 = mul i8 %a28, %b28
+ %r29 = mul i8 %a29, %b29
+ %r30 = mul i8 %a30, %b30
+ %r31 = mul i8 %a31, %b31
+ %r32 = mul i8 %a32, %b32
+ %r33 = mul i8 %a33, %b33
+ %r34 = mul i8 %a34, %b34
+ %r35 = mul i8 %a35, %b35
+ %r36 = mul i8 %a36, %b36
+ %r37 = mul i8 %a37, %b37
+ %r38 = mul i8 %a38, %b38
+ %r39 = mul i8 %a39, %b39
+ %r40 = mul i8 %a40, %b40
+ %r41 = mul i8 %a41, %b41
+ %r42 = mul i8 %a42, %b42
+ %r43 = mul i8 %a43, %b43
+ %r44 = mul i8 %a44, %b44
+ %r45 = mul i8 %a45, %b45
+ %r46 = mul i8 %a46, %b46
+ %r47 = mul i8 %a47, %b47
+ %r48 = mul i8 %a48, %b48
+ %r49 = mul i8 %a49, %b49
+ %r50 = mul i8 %a50, %b50
+ %r51 = mul i8 %a51, %b51
+ %r52 = mul i8 %a52, %b52
+ %r53 = mul i8 %a53, %b53
+ %r54 = mul i8 %a54, %b54
+ %r55 = mul i8 %a55, %b55
+ %r56 = mul i8 %a56, %b56
+ %r57 = mul i8 %a57, %b57
+ %r58 = mul i8 %a58, %b58
+ %r59 = mul i8 %a59, %b59
+ %r60 = mul i8 %a60, %b60
+ %r61 = mul i8 %a61, %b61
+ %r62 = mul i8 %a62, %b62
+ %r63 = mul i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
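+
+; Note: mul_v64i8 uses the single shared CHECK prefix because every RUN
+; configuration in this file vectorizes it to the same four <16 x i8>
+; multiplies; presumably the cost model at this revision treats wider i8
+; multiplies (which have no direct instruction and lower via i16 multiplies
+; and repacking) as no cheaper than the 128-bit form.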
diff --git a/test/Transforms/SLPVectorizer/X86/arith-sub.ll b/test/Transforms/SLPVectorizer/X86/arith-sub.ll
new file mode 100644
index 000000000000..85838369e226
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/arith-sub.ll
@@ -0,0 +1,649 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
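+;
+; Note: each RUN line exercises the SLP vectorizer at a different legal vector
+; width: the plain x86_64 target checks the SSE prefix (128-bit), the
+; corei7-avx/core-avx2 CPUs the AVX prefix (256-bit), and knl/skx the AVX512
+; prefix (512-bit where legal); CHECK lines are shared by all five RUN lines.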
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
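+;
+; Note: every array above spans 512 bits (8 x i64 = 16 x i32 = 32 x i16 =
+; 64 x i8) and is 64-byte aligned, so each sub_* test below reduces to exactly
+; one 512-bit, two 256-bit, or four 128-bit contiguous vector operations per
+; array, depending on the widest vector the RUN configuration supports.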
+
+define void @sub_v8i64() {
+; SSE-LABEL: @sub_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sub_v8i64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP5:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = sub <4 x i64> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = sub <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = sub i64 %a0, %b0
+ %r1 = sub i64 %a1, %b1
+ %r2 = sub i64 %a2, %b2
+ %r3 = sub i64 %a3, %b3
+ %r4 = sub i64 %a4, %b4
+ %r5 = sub i64 %a5, %b5
+ %r6 = sub i64 %a6, %b6
+ %r7 = sub i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
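+; Note: the checks above show the per-target split: four <2 x i64> ops for
+; SSE, two <4 x i64> for AVX, and a single <8 x i64> for AVX512. sub_v16i32
+; below splits the same way, while sub_v32i16 stays at <16 x i16> even for
+; the AVX512 prefix, since those checks are shared between knl (which lacks
+; AVX512BW) and skx.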
+
+define void @sub_v16i32() {
+; SSE-LABEL: @sub_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sub_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = sub <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = sub i32 %a0 , %b0
+ %r1 = sub i32 %a1 , %b1
+ %r2 = sub i32 %a2 , %b2
+ %r3 = sub i32 %a3 , %b3
+ %r4 = sub i32 %a4 , %b4
+ %r5 = sub i32 %a5 , %b5
+ %r6 = sub i32 %a6 , %b6
+ %r7 = sub i32 %a7 , %b7
+ %r8 = sub i32 %a8 , %b8
+ %r9 = sub i32 %a9 , %b9
+ %r10 = sub i32 %a10, %b10
+ %r11 = sub i32 %a11, %b11
+ %r12 = sub i32 %a12, %b12
+ %r13 = sub i32 %a13, %b13
+ %r14 = sub i32 %a14, %b14
+ %r15 = sub i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @sub_v32i16() {
+; SSE-LABEL: @sub_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sub_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = sub i16 %a0 , %b0
+ %r1 = sub i16 %a1 , %b1
+ %r2 = sub i16 %a2 , %b2
+ %r3 = sub i16 %a3 , %b3
+ %r4 = sub i16 %a4 , %b4
+ %r5 = sub i16 %a5 , %b5
+ %r6 = sub i16 %a6 , %b6
+ %r7 = sub i16 %a7 , %b7
+ %r8 = sub i16 %a8 , %b8
+ %r9 = sub i16 %a9 , %b9
+ %r10 = sub i16 %a10, %b10
+ %r11 = sub i16 %a11, %b11
+ %r12 = sub i16 %a12, %b12
+ %r13 = sub i16 %a13, %b13
+ %r14 = sub i16 %a14, %b14
+ %r15 = sub i16 %a15, %b15
+ %r16 = sub i16 %a16, %b16
+ %r17 = sub i16 %a17, %b17
+ %r18 = sub i16 %a18, %b18
+ %r19 = sub i16 %a19, %b19
+ %r20 = sub i16 %a20, %b20
+ %r21 = sub i16 %a21, %b21
+ %r22 = sub i16 %a22, %b22
+ %r23 = sub i16 %a23, %b23
+ %r24 = sub i16 %a24, %b24
+ %r25 = sub i16 %a25, %b25
+ %r26 = sub i16 %a26, %b26
+ %r27 = sub i16 %a27, %b27
+ %r28 = sub i16 %a28, %b28
+ %r29 = sub i16 %a29, %b29
+ %r30 = sub i16 %a30, %b30
+ %r31 = sub i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
+define void @sub_v64i8() {
+; CHECK-LABEL: @sub_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = sub <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = sub i8 %a0 , %b0
+ %r1 = sub i8 %a1 , %b1
+ %r2 = sub i8 %a2 , %b2
+ %r3 = sub i8 %a3 , %b3
+ %r4 = sub i8 %a4 , %b4
+ %r5 = sub i8 %a5 , %b5
+ %r6 = sub i8 %a6 , %b6
+ %r7 = sub i8 %a7 , %b7
+ %r8 = sub i8 %a8 , %b8
+ %r9 = sub i8 %a9 , %b9
+ %r10 = sub i8 %a10, %b10
+ %r11 = sub i8 %a11, %b11
+ %r12 = sub i8 %a12, %b12
+ %r13 = sub i8 %a13, %b13
+ %r14 = sub i8 %a14, %b14
+ %r15 = sub i8 %a15, %b15
+ %r16 = sub i8 %a16, %b16
+ %r17 = sub i8 %a17, %b17
+ %r18 = sub i8 %a18, %b18
+ %r19 = sub i8 %a19, %b19
+ %r20 = sub i8 %a20, %b20
+ %r21 = sub i8 %a21, %b21
+ %r22 = sub i8 %a22, %b22
+ %r23 = sub i8 %a23, %b23
+ %r24 = sub i8 %a24, %b24
+ %r25 = sub i8 %a25, %b25
+ %r26 = sub i8 %a26, %b26
+ %r27 = sub i8 %a27, %b27
+ %r28 = sub i8 %a28, %b28
+ %r29 = sub i8 %a29, %b29
+ %r30 = sub i8 %a30, %b30
+ %r31 = sub i8 %a31, %b31
+ %r32 = sub i8 %a32, %b32
+ %r33 = sub i8 %a33, %b33
+ %r34 = sub i8 %a34, %b34
+ %r35 = sub i8 %a35, %b35
+ %r36 = sub i8 %a36, %b36
+ %r37 = sub i8 %a37, %b37
+ %r38 = sub i8 %a38, %b38
+ %r39 = sub i8 %a39, %b39
+ %r40 = sub i8 %a40, %b40
+ %r41 = sub i8 %a41, %b41
+ %r42 = sub i8 %a42, %b42
+ %r43 = sub i8 %a43, %b43
+ %r44 = sub i8 %a44, %b44
+ %r45 = sub i8 %a45, %b45
+ %r46 = sub i8 %a46, %b46
+ %r47 = sub i8 %a47, %b47
+ %r48 = sub i8 %a48, %b48
+ %r49 = sub i8 %a49, %b49
+ %r50 = sub i8 %a50, %b50
+ %r51 = sub i8 %a51, %b51
+ %r52 = sub i8 %a52, %b52
+ %r53 = sub i8 %a53, %b53
+ %r54 = sub i8 %a54, %b54
+ %r55 = sub i8 %a55, %b55
+ %r56 = sub i8 %a56, %b56
+ %r57 = sub i8 %a57, %b57
+ %r58 = sub i8 %a58, %b58
+ %r59 = sub i8 %a59, %b59
+ %r60 = sub i8 %a60, %b60
+ %r61 = sub i8 %a61, %b61
+ %r62 = sub i8 %a62, %b62
+ %r63 = sub i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/shift-ashr.ll b/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
new file mode 100644
index 000000000000..646f599ce340
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
@@ -0,0 +1,913 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+define void @ashr_v8i64() {
+; SSE-LABEL: @ashr_v8i64(
+; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SSE-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]]
+; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @ashr_v8i64(
+; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]]
+; AVX1-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]]
+; AVX1-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]]
+; AVX1-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]]
+; AVX1-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]]
+; AVX1-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]]
+; AVX1-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]]
+; AVX1-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]]
+; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ashr_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @ashr_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @ashr_v8i64(
+; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = ashr i64 %a0, %b0
+ %r1 = ashr i64 %a1, %b1
+ %r2 = ashr i64 %a2, %b2
+ %r3 = ashr i64 %a3, %b3
+ %r4 = ashr i64 %a4, %b4
+ %r5 = ashr i64 %a5, %b5
+ %r6 = ashr i64 %a6, %b6
+ %r7 = ashr i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @ashr_v16i32() {
+; SSE-LABEL: @ashr_v16i32(
+; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; SSE-NEXT: [[R0:%.*]] = ashr i32 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = ashr i32 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = ashr i32 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = ashr i32 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = ashr i32 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = ashr i32 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = ashr i32 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = ashr i32 [[A7]], [[B7]]
+; SSE-NEXT: [[R8:%.*]] = ashr i32 [[A8]], [[B8]]
+; SSE-NEXT: [[R9:%.*]] = ashr i32 [[A9]], [[B9]]
+; SSE-NEXT: [[R10:%.*]] = ashr i32 [[A10]], [[B10]]
+; SSE-NEXT: [[R11:%.*]] = ashr i32 [[A11]], [[B11]]
+; SSE-NEXT: [[R12:%.*]] = ashr i32 [[A12]], [[B12]]
+; SSE-NEXT: [[R13:%.*]] = ashr i32 [[A13]], [[B13]]
+; SSE-NEXT: [[R14:%.*]] = ashr i32 [[A14]], [[B14]]
+; SSE-NEXT: [[R15:%.*]] = ashr i32 [[A15]], [[B15]]
+; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @ashr_v16i32(
+; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]]
+; AVX1-NEXT: [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]]
+; AVX1-NEXT: [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]]
+; AVX1-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]]
+; AVX1-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; AVX1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ashr_v16i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @ashr_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @ashr_v16i32(
+; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = ashr i32 %a0 , %b0
+ %r1 = ashr i32 %a1 , %b1
+ %r2 = ashr i32 %a2 , %b2
+ %r3 = ashr i32 %a3 , %b3
+ %r4 = ashr i32 %a4 , %b4
+ %r5 = ashr i32 %a5 , %b5
+ %r6 = ashr i32 %a6 , %b6
+ %r7 = ashr i32 %a7 , %b7
+ %r8 = ashr i32 %a8 , %b8
+ %r9 = ashr i32 %a9 , %b9
+ %r10 = ashr i32 %a10, %b10
+ %r11 = ashr i32 %a11, %b11
+ %r12 = ashr i32 %a12, %b12
+ %r13 = ashr i32 %a13, %b13
+ %r14 = ashr i32 %a14, %b14
+ %r15 = ashr i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
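+; In the v32i16 test below, the SSE run is expected to stay fully scalar
+; (32 scalar loads/ashr/stores per operand), presumably because the cost model
+; finds no profitable variable per-element i16 shift on plain SSE, while the
+; AVX, AVX512 and XOP runs split the 32 lanes into two <16 x i16> ashr ops.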
+define void @ashr_v32i16() {
+; SSE-LABEL: @ashr_v32i16(
+; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; SSE-NEXT: [[R0:%.*]] = ashr i16 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = ashr i16 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = ashr i16 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = ashr i16 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = ashr i16 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = ashr i16 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = ashr i16 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = ashr i16 [[A7]], [[B7]]
+; SSE-NEXT: [[R8:%.*]] = ashr i16 [[A8]], [[B8]]
+; SSE-NEXT: [[R9:%.*]] = ashr i16 [[A9]], [[B9]]
+; SSE-NEXT: [[R10:%.*]] = ashr i16 [[A10]], [[B10]]
+; SSE-NEXT: [[R11:%.*]] = ashr i16 [[A11]], [[B11]]
+; SSE-NEXT: [[R12:%.*]] = ashr i16 [[A12]], [[B12]]
+; SSE-NEXT: [[R13:%.*]] = ashr i16 [[A13]], [[B13]]
+; SSE-NEXT: [[R14:%.*]] = ashr i16 [[A14]], [[B14]]
+; SSE-NEXT: [[R15:%.*]] = ashr i16 [[A15]], [[B15]]
+; SSE-NEXT: [[R16:%.*]] = ashr i16 [[A16]], [[B16]]
+; SSE-NEXT: [[R17:%.*]] = ashr i16 [[A17]], [[B17]]
+; SSE-NEXT: [[R18:%.*]] = ashr i16 [[A18]], [[B18]]
+; SSE-NEXT: [[R19:%.*]] = ashr i16 [[A19]], [[B19]]
+; SSE-NEXT: [[R20:%.*]] = ashr i16 [[A20]], [[B20]]
+; SSE-NEXT: [[R21:%.*]] = ashr i16 [[A21]], [[B21]]
+; SSE-NEXT: [[R22:%.*]] = ashr i16 [[A22]], [[B22]]
+; SSE-NEXT: [[R23:%.*]] = ashr i16 [[A23]], [[B23]]
+; SSE-NEXT: [[R24:%.*]] = ashr i16 [[A24]], [[B24]]
+; SSE-NEXT: [[R25:%.*]] = ashr i16 [[A25]], [[B25]]
+; SSE-NEXT: [[R26:%.*]] = ashr i16 [[A26]], [[B26]]
+; SSE-NEXT: [[R27:%.*]] = ashr i16 [[A27]], [[B27]]
+; SSE-NEXT: [[R28:%.*]] = ashr i16 [[A28]], [[B28]]
+; SSE-NEXT: [[R29:%.*]] = ashr i16 [[A29]], [[B29]]
+; SSE-NEXT: [[R30:%.*]] = ashr i16 [[A30]], [[B30]]
+; SSE-NEXT: [[R31:%.*]] = ashr i16 [[A31]], [[B31]]
+; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @ashr_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @ashr_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @ashr_v32i16(
+; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = ashr i16 %a0 , %b0
+ %r1 = ashr i16 %a1 , %b1
+ %r2 = ashr i16 %a2 , %b2
+ %r3 = ashr i16 %a3 , %b3
+ %r4 = ashr i16 %a4 , %b4
+ %r5 = ashr i16 %a5 , %b5
+ %r6 = ashr i16 %a6 , %b6
+ %r7 = ashr i16 %a7 , %b7
+ %r8 = ashr i16 %a8 , %b8
+ %r9 = ashr i16 %a9 , %b9
+ %r10 = ashr i16 %a10, %b10
+ %r11 = ashr i16 %a11, %b11
+ %r12 = ashr i16 %a12, %b12
+ %r13 = ashr i16 %a13, %b13
+ %r14 = ashr i16 %a14, %b14
+ %r15 = ashr i16 %a15, %b15
+ %r16 = ashr i16 %a16, %b16
+ %r17 = ashr i16 %a17, %b17
+ %r18 = ashr i16 %a18, %b18
+ %r19 = ashr i16 %a19, %b19
+ %r20 = ashr i16 %a20, %b20
+ %r21 = ashr i16 %a21, %b21
+ %r22 = ashr i16 %a22, %b22
+ %r23 = ashr i16 %a23, %b23
+ %r24 = ashr i16 %a24, %b24
+ %r25 = ashr i16 %a25, %b25
+ %r26 = ashr i16 %a26, %b26
+ %r27 = ashr i16 %a27, %b27
+ %r28 = ashr i16 %a28, %b28
+ %r29 = ashr i16 %a29, %b29
+ %r30 = ashr i16 %a30, %b30
+ %r31 = ashr i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
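+; The v64i8 case vectorizes identically on every configuration, hence the
+; single shared CHECK prefix: four <16 x i8> ashr ops covering all 64 lanes.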
+define void @ashr_v64i8() {
+; CHECK-LABEL: @ashr_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = ashr i8 %a0 , %b0
+ %r1 = ashr i8 %a1 , %b1
+ %r2 = ashr i8 %a2 , %b2
+ %r3 = ashr i8 %a3 , %b3
+ %r4 = ashr i8 %a4 , %b4
+ %r5 = ashr i8 %a5 , %b5
+ %r6 = ashr i8 %a6 , %b6
+ %r7 = ashr i8 %a7 , %b7
+ %r8 = ashr i8 %a8 , %b8
+ %r9 = ashr i8 %a9 , %b9
+ %r10 = ashr i8 %a10, %b10
+ %r11 = ashr i8 %a11, %b11
+ %r12 = ashr i8 %a12, %b12
+ %r13 = ashr i8 %a13, %b13
+ %r14 = ashr i8 %a14, %b14
+ %r15 = ashr i8 %a15, %b15
+ %r16 = ashr i8 %a16, %b16
+ %r17 = ashr i8 %a17, %b17
+ %r18 = ashr i8 %a18, %b18
+ %r19 = ashr i8 %a19, %b19
+ %r20 = ashr i8 %a20, %b20
+ %r21 = ashr i8 %a21, %b21
+ %r22 = ashr i8 %a22, %b22
+ %r23 = ashr i8 %a23, %b23
+ %r24 = ashr i8 %a24, %b24
+ %r25 = ashr i8 %a25, %b25
+ %r26 = ashr i8 %a26, %b26
+ %r27 = ashr i8 %a27, %b27
+ %r28 = ashr i8 %a28, %b28
+ %r29 = ashr i8 %a29, %b29
+ %r30 = ashr i8 %a30, %b30
+ %r31 = ashr i8 %a31, %b31
+ %r32 = ashr i8 %a32, %b32
+ %r33 = ashr i8 %a33, %b33
+ %r34 = ashr i8 %a34, %b34
+ %r35 = ashr i8 %a35, %b35
+ %r36 = ashr i8 %a36, %b36
+ %r37 = ashr i8 %a37, %b37
+ %r38 = ashr i8 %a38, %b38
+ %r39 = ashr i8 %a39, %b39
+ %r40 = ashr i8 %a40, %b40
+ %r41 = ashr i8 %a41, %b41
+ %r42 = ashr i8 %a42, %b42
+ %r43 = ashr i8 %a43, %b43
+ %r44 = ashr i8 %a44, %b44
+ %r45 = ashr i8 %a45, %b45
+ %r46 = ashr i8 %a46, %b46
+ %r47 = ashr i8 %a47, %b47
+ %r48 = ashr i8 %a48, %b48
+ %r49 = ashr i8 %a49, %b49
+ %r50 = ashr i8 %a50, %b50
+ %r51 = ashr i8 %a51, %b51
+ %r52 = ashr i8 %a52, %b52
+ %r53 = ashr i8 %a53, %b53
+ %r54 = ashr i8 %a54, %b54
+ %r55 = ashr i8 %a55, %b55
+ %r56 = ashr i8 %a56, %b56
+ %r57 = ashr i8 %a57, %b57
+ %r58 = ashr i8 %a58, %b58
+ %r59 = ashr i8 %a59, %b59
+ %r60 = ashr i8 %a60, %b60
+ %r61 = ashr i8 %a61, %b61
+ %r62 = ashr i8 %a62, %b62
+ %r63 = ashr i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/shift-lshr.ll b/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
new file mode 100644
index 000000000000..6fd78e7c9699
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
@@ -0,0 +1,862 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
+
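+; Same harness as shift-ashr.ll: each RUN line pins an -mcpu and a FileCheck
+; prefix so the expected vector widths in the checks below can differ per target.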
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
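+; For v8i64 lshr the expected split varies by target: SSE and AVX1 use four
+; <2 x i64> ops, AVX2 and XOP use two <4 x i64> ops, and AVX512 covers all
+; eight lanes with a single <8 x i64> op.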
+define void @lshr_v8i64() {
+; SSE-LABEL: @lshr_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @lshr_v8i64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]
+; AVX1-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]
+; AVX1-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]
+; AVX1-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]
+; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @lshr_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @lshr_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = lshr <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @lshr_v8i64(
+; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = lshr i64 %a0, %b0
+ %r1 = lshr i64 %a1, %b1
+ %r2 = lshr i64 %a2, %b2
+ %r3 = lshr i64 %a3, %b3
+ %r4 = lshr i64 %a4, %b4
+ %r5 = lshr i64 %a5, %b5
+ %r6 = lshr i64 %a6, %b6
+ %r7 = lshr i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
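+; Logical right shift of sixteen i32 lanes: the SSE checks expect the scalar
+; loads, shifts, and stores to remain unvectorized, while AVX and XOP split the
+; work into two <8 x i32> halves and AVX512 uses a single <16 x i32> operation.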
+define void @lshr_v16i32() {
+; SSE-LABEL: @lshr_v16i32(
+; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; SSE-NEXT: [[R0:%.*]] = lshr i32 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = lshr i32 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = lshr i32 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = lshr i32 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = lshr i32 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = lshr i32 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = lshr i32 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = lshr i32 [[A7]], [[B7]]
+; SSE-NEXT: [[R8:%.*]] = lshr i32 [[A8]], [[B8]]
+; SSE-NEXT: [[R9:%.*]] = lshr i32 [[A9]], [[B9]]
+; SSE-NEXT: [[R10:%.*]] = lshr i32 [[A10]], [[B10]]
+; SSE-NEXT: [[R11:%.*]] = lshr i32 [[A11]], [[B11]]
+; SSE-NEXT: [[R12:%.*]] = lshr i32 [[A12]], [[B12]]
+; SSE-NEXT: [[R13:%.*]] = lshr i32 [[A13]], [[B13]]
+; SSE-NEXT: [[R14:%.*]] = lshr i32 [[A14]], [[B14]]
+; SSE-NEXT: [[R15:%.*]] = lshr i32 [[A15]], [[B15]]
+; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @lshr_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @lshr_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = lshr <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @lshr_v16i32(
+; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = lshr i32 %a0 , %b0
+ %r1 = lshr i32 %a1 , %b1
+ %r2 = lshr i32 %a2 , %b2
+ %r3 = lshr i32 %a3 , %b3
+ %r4 = lshr i32 %a4 , %b4
+ %r5 = lshr i32 %a5 , %b5
+ %r6 = lshr i32 %a6 , %b6
+ %r7 = lshr i32 %a7 , %b7
+ %r8 = lshr i32 %a8 , %b8
+ %r9 = lshr i32 %a9 , %b9
+ %r10 = lshr i32 %a10, %b10
+ %r11 = lshr i32 %a11, %b11
+ %r12 = lshr i32 %a12, %b12
+ %r13 = lshr i32 %a13, %b13
+ %r14 = lshr i32 %a14, %b14
+ %r15 = lshr i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
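+; Logical right shift of thirty-two i16 lanes: SSE stays fully scalar, while
+; AVX, AVX512, and XOP all vectorize as two <16 x i16> halves.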
+define void @lshr_v32i16() {
+; SSE-LABEL: @lshr_v32i16(
+; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; SSE-NEXT: [[R0:%.*]] = lshr i16 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = lshr i16 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = lshr i16 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = lshr i16 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = lshr i16 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = lshr i16 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = lshr i16 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = lshr i16 [[A7]], [[B7]]
+; SSE-NEXT: [[R8:%.*]] = lshr i16 [[A8]], [[B8]]
+; SSE-NEXT: [[R9:%.*]] = lshr i16 [[A9]], [[B9]]
+; SSE-NEXT: [[R10:%.*]] = lshr i16 [[A10]], [[B10]]
+; SSE-NEXT: [[R11:%.*]] = lshr i16 [[A11]], [[B11]]
+; SSE-NEXT: [[R12:%.*]] = lshr i16 [[A12]], [[B12]]
+; SSE-NEXT: [[R13:%.*]] = lshr i16 [[A13]], [[B13]]
+; SSE-NEXT: [[R14:%.*]] = lshr i16 [[A14]], [[B14]]
+; SSE-NEXT: [[R15:%.*]] = lshr i16 [[A15]], [[B15]]
+; SSE-NEXT: [[R16:%.*]] = lshr i16 [[A16]], [[B16]]
+; SSE-NEXT: [[R17:%.*]] = lshr i16 [[A17]], [[B17]]
+; SSE-NEXT: [[R18:%.*]] = lshr i16 [[A18]], [[B18]]
+; SSE-NEXT: [[R19:%.*]] = lshr i16 [[A19]], [[B19]]
+; SSE-NEXT: [[R20:%.*]] = lshr i16 [[A20]], [[B20]]
+; SSE-NEXT: [[R21:%.*]] = lshr i16 [[A21]], [[B21]]
+; SSE-NEXT: [[R22:%.*]] = lshr i16 [[A22]], [[B22]]
+; SSE-NEXT: [[R23:%.*]] = lshr i16 [[A23]], [[B23]]
+; SSE-NEXT: [[R24:%.*]] = lshr i16 [[A24]], [[B24]]
+; SSE-NEXT: [[R25:%.*]] = lshr i16 [[A25]], [[B25]]
+; SSE-NEXT: [[R26:%.*]] = lshr i16 [[A26]], [[B26]]
+; SSE-NEXT: [[R27:%.*]] = lshr i16 [[A27]], [[B27]]
+; SSE-NEXT: [[R28:%.*]] = lshr i16 [[A28]], [[B28]]
+; SSE-NEXT: [[R29:%.*]] = lshr i16 [[A29]], [[B29]]
+; SSE-NEXT: [[R30:%.*]] = lshr i16 [[A30]], [[B30]]
+; SSE-NEXT: [[R31:%.*]] = lshr i16 [[A31]], [[B31]]
+; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @lshr_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @lshr_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @lshr_v32i16(
+; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = lshr i16 %a0 , %b0
+ %r1 = lshr i16 %a1 , %b1
+ %r2 = lshr i16 %a2 , %b2
+ %r3 = lshr i16 %a3 , %b3
+ %r4 = lshr i16 %a4 , %b4
+ %r5 = lshr i16 %a5 , %b5
+ %r6 = lshr i16 %a6 , %b6
+ %r7 = lshr i16 %a7 , %b7
+ %r8 = lshr i16 %a8 , %b8
+ %r9 = lshr i16 %a9 , %b9
+ %r10 = lshr i16 %a10, %b10
+ %r11 = lshr i16 %a11, %b11
+ %r12 = lshr i16 %a12, %b12
+ %r13 = lshr i16 %a13, %b13
+ %r14 = lshr i16 %a14, %b14
+ %r15 = lshr i16 %a15, %b15
+ %r16 = lshr i16 %a16, %b16
+ %r17 = lshr i16 %a17, %b17
+ %r18 = lshr i16 %a18, %b18
+ %r19 = lshr i16 %a19, %b19
+ %r20 = lshr i16 %a20, %b20
+ %r21 = lshr i16 %a21, %b21
+ %r22 = lshr i16 %a22, %b22
+ %r23 = lshr i16 %a23, %b23
+ %r24 = lshr i16 %a24, %b24
+ %r25 = lshr i16 %a25, %b25
+ %r26 = lshr i16 %a26, %b26
+ %r27 = lshr i16 %a27, %b27
+ %r28 = lshr i16 %a28, %b28
+ %r29 = lshr i16 %a29, %b29
+ %r30 = lshr i16 %a30, %b30
+ %r31 = lshr i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
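+; Logical right shift of sixty-four i8 lanes: a common CHECK prefix covers all
+; targets, each vectorizing as four <16 x i8> operations.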
+define void @lshr_v64i8() {
+; CHECK-LABEL: @lshr_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = lshr <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = lshr <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = lshr i8 %a0 , %b0
+ %r1 = lshr i8 %a1 , %b1
+ %r2 = lshr i8 %a2 , %b2
+ %r3 = lshr i8 %a3 , %b3
+ %r4 = lshr i8 %a4 , %b4
+ %r5 = lshr i8 %a5 , %b5
+ %r6 = lshr i8 %a6 , %b6
+ %r7 = lshr i8 %a7 , %b7
+ %r8 = lshr i8 %a8 , %b8
+ %r9 = lshr i8 %a9 , %b9
+ %r10 = lshr i8 %a10, %b10
+ %r11 = lshr i8 %a11, %b11
+ %r12 = lshr i8 %a12, %b12
+ %r13 = lshr i8 %a13, %b13
+ %r14 = lshr i8 %a14, %b14
+ %r15 = lshr i8 %a15, %b15
+ %r16 = lshr i8 %a16, %b16
+ %r17 = lshr i8 %a17, %b17
+ %r18 = lshr i8 %a18, %b18
+ %r19 = lshr i8 %a19, %b19
+ %r20 = lshr i8 %a20, %b20
+ %r21 = lshr i8 %a21, %b21
+ %r22 = lshr i8 %a22, %b22
+ %r23 = lshr i8 %a23, %b23
+ %r24 = lshr i8 %a24, %b24
+ %r25 = lshr i8 %a25, %b25
+ %r26 = lshr i8 %a26, %b26
+ %r27 = lshr i8 %a27, %b27
+ %r28 = lshr i8 %a28, %b28
+ %r29 = lshr i8 %a29, %b29
+ %r30 = lshr i8 %a30, %b30
+ %r31 = lshr i8 %a31, %b31
+ %r32 = lshr i8 %a32, %b32
+ %r33 = lshr i8 %a33, %b33
+ %r34 = lshr i8 %a34, %b34
+ %r35 = lshr i8 %a35, %b35
+ %r36 = lshr i8 %a36, %b36
+ %r37 = lshr i8 %a37, %b37
+ %r38 = lshr i8 %a38, %b38
+ %r39 = lshr i8 %a39, %b39
+ %r40 = lshr i8 %a40, %b40
+ %r41 = lshr i8 %a41, %b41
+ %r42 = lshr i8 %a42, %b42
+ %r43 = lshr i8 %a43, %b43
+ %r44 = lshr i8 %a44, %b44
+ %r45 = lshr i8 %a45, %b45
+ %r46 = lshr i8 %a46, %b46
+ %r47 = lshr i8 %a47, %b47
+ %r48 = lshr i8 %a48, %b48
+ %r49 = lshr i8 %a49, %b49
+ %r50 = lshr i8 %a50, %b50
+ %r51 = lshr i8 %a51, %b51
+ %r52 = lshr i8 %a52, %b52
+ %r53 = lshr i8 %a53, %b53
+ %r54 = lshr i8 %a54, %b54
+ %r55 = lshr i8 %a55, %b55
+ %r56 = lshr i8 %a56, %b56
+ %r57 = lshr i8 %a57, %b57
+ %r58 = lshr i8 %a58, %b58
+ %r59 = lshr i8 %a59, %b59
+ %r60 = lshr i8 %a60, %b60
+ %r61 = lshr i8 %a61, %b61
+ %r62 = lshr i8 %a62, %b62
+ %r63 = lshr i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/shift-shl.ll b/test/Transforms/SLPVectorizer/X86/shift-shl.ll
new file mode 100644
index 000000000000..70de82bdea5f
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -0,0 +1,814 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
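+; Note: each RUN line above exercises the SLP vectorizer for a different
+; target ISA; the FileCheck prefixes map as follows: SSE (generic x86_64),
+; AVX1 (corei7-avx), AVX2 (core-avx2), AVX512F (knl), AVX512BW (skx), and
+; XOP (bdver4), with the shared CHECK prefix matching output common to all
+; of the configurations.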
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
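+; The @a*/@b* arrays below supply the scalar shift operands and @c* receives
+; the results; each test expects its scalar load/shl/store chain to be
+; SLP-vectorized into the widest vector operations the target supports, as
+; reflected in the per-prefix assertions.
+;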
+define void @shl_v8i64() {
+; SSE-LABEL: @shl_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @shl_v8i64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
+; AVX1-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
+; AVX1-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
+; AVX1-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
+; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @shl_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @shl_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = shl <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @shl_v8i64(
+; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = shl i64 %a0, %b0
+ %r1 = shl i64 %a1, %b1
+ %r2 = shl i64 %a2, %b2
+ %r3 = shl i64 %a3, %b3
+ %r4 = shl i64 %a4, %b4
+ %r5 = shl i64 %a5, %b5
+ %r6 = shl i64 %a6, %b6
+ %r7 = shl i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @shl_v16i32() {
+; SSE-LABEL: @shl_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = shl <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = shl <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @shl_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @shl_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = shl <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @shl_v16i32(
+; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = shl i32 %a0 , %b0
+ %r1 = shl i32 %a1 , %b1
+ %r2 = shl i32 %a2 , %b2
+ %r3 = shl i32 %a3 , %b3
+ %r4 = shl i32 %a4 , %b4
+ %r5 = shl i32 %a5 , %b5
+ %r6 = shl i32 %a6 , %b6
+ %r7 = shl i32 %a7 , %b7
+ %r8 = shl i32 %a8 , %b8
+ %r9 = shl i32 %a9 , %b9
+ %r10 = shl i32 %a10, %b10
+ %r11 = shl i32 %a11, %b11
+ %r12 = shl i32 %a12, %b12
+ %r13 = shl i32 %a13, %b13
+ %r14 = shl i32 %a14, %b14
+ %r15 = shl i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @shl_v32i16() {
+; SSE-LABEL: @shl_v32i16(
+; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; SSE-NEXT: [[R0:%.*]] = shl i16 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = shl i16 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = shl i16 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = shl i16 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = shl i16 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = shl i16 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = shl i16 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = shl i16 [[A7]], [[B7]]
+; SSE-NEXT: [[R8:%.*]] = shl i16 [[A8]], [[B8]]
+; SSE-NEXT: [[R9:%.*]] = shl i16 [[A9]], [[B9]]
+; SSE-NEXT: [[R10:%.*]] = shl i16 [[A10]], [[B10]]
+; SSE-NEXT: [[R11:%.*]] = shl i16 [[A11]], [[B11]]
+; SSE-NEXT: [[R12:%.*]] = shl i16 [[A12]], [[B12]]
+; SSE-NEXT: [[R13:%.*]] = shl i16 [[A13]], [[B13]]
+; SSE-NEXT: [[R14:%.*]] = shl i16 [[A14]], [[B14]]
+; SSE-NEXT: [[R15:%.*]] = shl i16 [[A15]], [[B15]]
+; SSE-NEXT: [[R16:%.*]] = shl i16 [[A16]], [[B16]]
+; SSE-NEXT: [[R17:%.*]] = shl i16 [[A17]], [[B17]]
+; SSE-NEXT: [[R18:%.*]] = shl i16 [[A18]], [[B18]]
+; SSE-NEXT: [[R19:%.*]] = shl i16 [[A19]], [[B19]]
+; SSE-NEXT: [[R20:%.*]] = shl i16 [[A20]], [[B20]]
+; SSE-NEXT: [[R21:%.*]] = shl i16 [[A21]], [[B21]]
+; SSE-NEXT: [[R22:%.*]] = shl i16 [[A22]], [[B22]]
+; SSE-NEXT: [[R23:%.*]] = shl i16 [[A23]], [[B23]]
+; SSE-NEXT: [[R24:%.*]] = shl i16 [[A24]], [[B24]]
+; SSE-NEXT: [[R25:%.*]] = shl i16 [[A25]], [[B25]]
+; SSE-NEXT: [[R26:%.*]] = shl i16 [[A26]], [[B26]]
+; SSE-NEXT: [[R27:%.*]] = shl i16 [[A27]], [[B27]]
+; SSE-NEXT: [[R28:%.*]] = shl i16 [[A28]], [[B28]]
+; SSE-NEXT: [[R29:%.*]] = shl i16 [[A29]], [[B29]]
+; SSE-NEXT: [[R30:%.*]] = shl i16 [[A30]], [[B30]]
+; SSE-NEXT: [[R31:%.*]] = shl i16 [[A31]], [[B31]]
+; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @shl_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @shl_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @shl_v32i16(
+; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = shl i16 %a0 , %b0
+ %r1 = shl i16 %a1 , %b1
+ %r2 = shl i16 %a2 , %b2
+ %r3 = shl i16 %a3 , %b3
+ %r4 = shl i16 %a4 , %b4
+ %r5 = shl i16 %a5 , %b5
+ %r6 = shl i16 %a6 , %b6
+ %r7 = shl i16 %a7 , %b7
+ %r8 = shl i16 %a8 , %b8
+ %r9 = shl i16 %a9 , %b9
+ %r10 = shl i16 %a10, %b10
+ %r11 = shl i16 %a11, %b11
+ %r12 = shl i16 %a12, %b12
+ %r13 = shl i16 %a13, %b13
+ %r14 = shl i16 %a14, %b14
+ %r15 = shl i16 %a15, %b15
+ %r16 = shl i16 %a16, %b16
+ %r17 = shl i16 %a17, %b17
+ %r18 = shl i16 %a18, %b18
+ %r19 = shl i16 %a19, %b19
+ %r20 = shl i16 %a20, %b20
+ %r21 = shl i16 %a21, %b21
+ %r22 = shl i16 %a22, %b22
+ %r23 = shl i16 %a23, %b23
+ %r24 = shl i16 %a24, %b24
+ %r25 = shl i16 %a25, %b25
+ %r26 = shl i16 %a26, %b26
+ %r27 = shl i16 %a27, %b27
+ %r28 = shl i16 %a28, %b28
+ %r29 = shl i16 %a29, %b29
+ %r30 = shl i16 %a30, %b30
+ %r31 = shl i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
+define void @shl_v64i8() {
+; CHECK-LABEL: @shl_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = shl <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = shl <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = shl i8 %a0 , %b0
+ %r1 = shl i8 %a1 , %b1
+ %r2 = shl i8 %a2 , %b2
+ %r3 = shl i8 %a3 , %b3
+ %r4 = shl i8 %a4 , %b4
+ %r5 = shl i8 %a5 , %b5
+ %r6 = shl i8 %a6 , %b6
+ %r7 = shl i8 %a7 , %b7
+ %r8 = shl i8 %a8 , %b8
+ %r9 = shl i8 %a9 , %b9
+ %r10 = shl i8 %a10, %b10
+ %r11 = shl i8 %a11, %b11
+ %r12 = shl i8 %a12, %b12
+ %r13 = shl i8 %a13, %b13
+ %r14 = shl i8 %a14, %b14
+ %r15 = shl i8 %a15, %b15
+ %r16 = shl i8 %a16, %b16
+ %r17 = shl i8 %a17, %b17
+ %r18 = shl i8 %a18, %b18
+ %r19 = shl i8 %a19, %b19
+ %r20 = shl i8 %a20, %b20
+ %r21 = shl i8 %a21, %b21
+ %r22 = shl i8 %a22, %b22
+ %r23 = shl i8 %a23, %b23
+ %r24 = shl i8 %a24, %b24
+ %r25 = shl i8 %a25, %b25
+ %r26 = shl i8 %a26, %b26
+ %r27 = shl i8 %a27, %b27
+ %r28 = shl i8 %a28, %b28
+ %r29 = shl i8 %a29, %b29
+ %r30 = shl i8 %a30, %b30
+ %r31 = shl i8 %a31, %b31
+ %r32 = shl i8 %a32, %b32
+ %r33 = shl i8 %a33, %b33
+ %r34 = shl i8 %a34, %b34
+ %r35 = shl i8 %a35, %b35
+ %r36 = shl i8 %a36, %b36
+ %r37 = shl i8 %a37, %b37
+ %r38 = shl i8 %a38, %b38
+ %r39 = shl i8 %a39, %b39
+ %r40 = shl i8 %a40, %b40
+ %r41 = shl i8 %a41, %b41
+ %r42 = shl i8 %a42, %b42
+ %r43 = shl i8 %a43, %b43
+ %r44 = shl i8 %a44, %b44
+ %r45 = shl i8 %a45, %b45
+ %r46 = shl i8 %a46, %b46
+ %r47 = shl i8 %a47, %b47
+ %r48 = shl i8 %a48, %b48
+ %r49 = shl i8 %a49, %b49
+ %r50 = shl i8 %a50, %b50
+ %r51 = shl i8 %a51, %b51
+ %r52 = shl i8 %a52, %b52
+ %r53 = shl i8 %a53, %b53
+ %r54 = shl i8 %a54, %b54
+ %r55 = shl i8 %a55, %b55
+ %r56 = shl i8 %a56, %b56
+ %r57 = shl i8 %a57, %b57
+ %r58 = shl i8 %a58, %b58
+ %r59 = shl i8 %a59, %b59
+ %r60 = shl i8 %a60, %b60
+ %r61 = shl i8 %a61, %b61
+ %r62 = shl i8 %a62, %b62
+ %r63 = shl i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll b/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
index 42b4f3dea75b..3ac3c5138ae7 100644
--- a/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
+++ b/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
@@ -183,3 +183,202 @@ loop_exit3:
; CHECK: [[UNREACHABLE]]:
; CHECK-NEXT: unreachable
}
+
+; This test contains a trivially unswitchable branch with an LCSSA phi node in
+; a loop exit block.
+define i32 @test5(i1 %cond1, i32 %x, i32 %y) {
+; CHECK-LABEL: @test5(
+entry:
+ br label %loop_begin
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 %{{.*}}, label %entry.split, label %loop_exit
+;
+; CHECK: entry.split:
+; CHECK-NEXT: br label %loop_begin
+
+loop_begin:
+ br i1 %cond1, label %latch, label %loop_exit
+; CHECK: loop_begin:
+; CHECK-NEXT: br label %latch
+
+latch:
+ call void @some_func() noreturn nounwind
+ br label %loop_begin
+; CHECK: latch:
+; CHECK-NEXT: call
+; CHECK-NEXT: br label %loop_begin
+
+loop_exit:
+ %result1 = phi i32 [ %x, %loop_begin ]
+ %result2 = phi i32 [ %y, %loop_begin ]
+ %result = add i32 %result1, %result2
+ ret i32 %result
+; CHECK: loop_exit:
+; CHECK-NEXT: %[[R1:.*]] = phi i32 [ %x, %entry ]
+; CHECK-NEXT: %[[R2:.*]] = phi i32 [ %y, %entry ]
+; CHECK-NEXT: %[[R:.*]] = add i32 %[[R1]], %[[R2]]
+; CHECK-NEXT: ret i32 %[[R]]
+}
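+;
+; In other words, a sketch of what the CHECK lines above verify: because
+; %cond1 is loop-invariant, the branch on it is hoisted into the entry block,
+; and the LCSSA phis in %loop_exit have their incoming block rewritten from
+; %loop_begin to %entry:
+;
+;   before: %result1 = phi i32 [ %x, %loop_begin ]
+;   after:  %result1 = phi i32 [ %x, %entry ]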
+
+; This test contains a trivially unswitchable branch with a real phi node in LCSSA
+; position in a shared exit block where a different path through the loop
+; produces a non-invariant input to the PHI node.
+define i32 @test6(i32* %var, i1 %cond1, i1 %cond2, i32 %x, i32 %y) {
+; CHECK-LABEL: @test6(
+entry:
+ br label %loop_begin
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 %{{.*}}, label %entry.split, label %loop_exit.split
+;
+; CHECK: entry.split:
+; CHECK-NEXT: br label %loop_begin
+
+loop_begin:
+ br i1 %cond1, label %continue, label %loop_exit
+; CHECK: loop_begin:
+; CHECK-NEXT: br label %continue
+
+continue:
+ %var_val = load i32, i32* %var
+ br i1 %cond2, label %latch, label %loop_exit
+; CHECK: continue:
+; CHECK-NEXT: load
+; CHECK-NEXT: br i1 %cond2, label %latch, label %loop_exit
+
+latch:
+ call void @some_func() noreturn nounwind
+ br label %loop_begin
+; CHECK: latch:
+; CHECK-NEXT: call
+; CHECK-NEXT: br label %loop_begin
+
+loop_exit:
+ %result1 = phi i32 [ %x, %loop_begin ], [ %var_val, %continue ]
+ %result2 = phi i32 [ %var_val, %continue ], [ %y, %loop_begin ]
+ %result = add i32 %result1, %result2
+ ret i32 %result
+; CHECK: loop_exit:
+; CHECK-NEXT: %[[R1:.*]] = phi i32 [ %var_val, %continue ]
+; CHECK-NEXT: %[[R2:.*]] = phi i32 [ %var_val, %continue ]
+; CHECK-NEXT: br label %loop_exit.split
+;
+; CHECK: loop_exit.split:
+; CHECK-NEXT: %[[R1S:.*]] = phi i32 [ %x, %entry ], [ %[[R1]], %loop_exit ]
+; CHECK-NEXT: %[[R2S:.*]] = phi i32 [ %y, %entry ], [ %[[R2]], %loop_exit ]
+; CHECK-NEXT: %[[R:.*]] = add i32 %[[R1S]], %[[R2S]]
+; CHECK-NEXT: ret i32 %[[R]]
+}
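+;
+; A sketch of what the CHECK lines above verify: the exit block has to be
+; split because one phi input (%var_val) is defined inside the loop. The
+; non-invariant inputs stay in %loop_exit, still fed from %continue, while
+; the invariant inputs (%x, %y) arrive in %loop_exit.split directly from
+; %entry, and new phis in the split block merge the two paths.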
+
+; This test contains a trivially unswitchable switch with an LCSSA phi node in
+; a loop exit block.
+define i32 @test7(i32 %cond1, i32 %x, i32 %y) {
+; CHECK-LABEL: @test7(
+entry:
+ br label %loop_begin
+; CHECK-NEXT: entry:
+; CHECK-NEXT: switch i32 %cond1, label %entry.split [
+; CHECK-NEXT: i32 0, label %loop_exit
+; CHECK-NEXT: i32 1, label %loop_exit
+; CHECK-NEXT: ]
+;
+; CHECK: entry.split:
+; CHECK-NEXT: br label %loop_begin
+
+loop_begin:
+ switch i32 %cond1, label %latch [
+ i32 0, label %loop_exit
+ i32 1, label %loop_exit
+ ]
+; CHECK: loop_begin:
+; CHECK-NEXT: br label %latch
+
+latch:
+ call void @some_func() noreturn nounwind
+ br label %loop_begin
+; CHECK: latch:
+; CHECK-NEXT: call
+; CHECK-NEXT: br label %loop_begin
+
+loop_exit:
+ %result1 = phi i32 [ %x, %loop_begin ], [ %x, %loop_begin ]
+ %result2 = phi i32 [ %y, %loop_begin ], [ %y, %loop_begin ]
+ %result = add i32 %result1, %result2
+ ret i32 %result
+; CHECK: loop_exit:
+; CHECK-NEXT: %[[R1:.*]] = phi i32 [ %x, %entry ], [ %x, %entry ]
+; CHECK-NEXT: %[[R2:.*]] = phi i32 [ %y, %entry ], [ %y, %entry ]
+; CHECK-NEXT: %[[R:.*]] = add i32 %[[R1]], %[[R2]]
+; CHECK-NEXT: ret i32 %[[R]]
+}
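+;
+; Note on the duplicate phi operands checked above: unswitching redirects
+; each invariant case edge (i32 0 and i32 1) to the new switch in the entry
+; block, so the exit phis keep one incoming value per edge, both now coming
+; from %entry.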
+
+; This test contains a trivially unswitchable switch with a real phi node in
+; LCSSA position in a shared exit block where a different path through the loop
+; produces a non-invariant input to the PHI node.
+define i32 @test8(i32* %var, i32 %cond1, i32 %cond2, i32 %x, i32 %y) {
+; CHECK-LABEL: @test8(
+entry:
+ br label %loop_begin
+; CHECK-NEXT: entry:
+; CHECK-NEXT: switch i32 %cond1, label %entry.split [
+; CHECK-NEXT: i32 0, label %loop_exit.split
+; CHECK-NEXT: i32 1, label %loop_exit2
+; CHECK-NEXT: i32 2, label %loop_exit.split
+; CHECK-NEXT: ]
+;
+; CHECK: entry.split:
+; CHECK-NEXT: br label %loop_begin
+
+loop_begin:
+ switch i32 %cond1, label %continue [
+ i32 0, label %loop_exit
+ i32 1, label %loop_exit2
+ i32 2, label %loop_exit
+ ]
+; CHECK: loop_begin:
+; CHECK-NEXT: br label %continue
+
+continue:
+ %var_val = load i32, i32* %var
+ switch i32 %cond2, label %latch [
+ i32 0, label %loop_exit
+ ]
+; CHECK: continue:
+; CHECK-NEXT: load
+; CHECK-NEXT: switch i32 %cond2, label %latch [
+; CHECK-NEXT: i32 0, label %loop_exit
+; CHECK-NEXT: ]
+
+latch:
+ call void @some_func() noreturn nounwind
+ br label %loop_begin
+; CHECK: latch:
+; CHECK-NEXT: call
+; CHECK-NEXT: br label %loop_begin
+
+loop_exit:
+ %result1.1 = phi i32 [ %x, %loop_begin ], [ %x, %loop_begin ], [ %var_val, %continue ]
+ %result1.2 = phi i32 [ %var_val, %continue ], [ %y, %loop_begin ], [ %y, %loop_begin ]
+ %result1 = add i32 %result1.1, %result1.2
+ ret i32 %result1
+; CHECK: loop_exit:
+; CHECK-NEXT: %[[R1:.*]] = phi i32 [ %var_val, %continue ]
+; CHECK-NEXT: %[[R2:.*]] = phi i32 [ %var_val, %continue ]
+; CHECK-NEXT: br label %loop_exit.split
+;
+; CHECK: loop_exit.split:
+; CHECK-NEXT: %[[R1S:.*]] = phi i32 [ %x, %entry ], [ %x, %entry ], [ %[[R1]], %loop_exit ]
+; CHECK-NEXT: %[[R2S:.*]] = phi i32 [ %y, %entry ], [ %y, %entry ], [ %[[R2]], %loop_exit ]
+; CHECK-NEXT: %[[R:.*]] = add i32 %[[R1S]], %[[R2S]]
+; CHECK-NEXT: ret i32 %[[R]]
+
+loop_exit2:
+ %result2.1 = phi i32 [ %x, %loop_begin ]
+ %result2.2 = phi i32 [ %y, %loop_begin ]
+ %result2 = add i32 %result2.1, %result2.2
+ ret i32 %result2
+; CHECK: loop_exit2:
+; CHECK-NEXT: %[[R1:.*]] = phi i32 [ %x, %entry ]
+; CHECK-NEXT: %[[R2:.*]] = phi i32 [ %y, %entry ]
+; CHECK-NEXT: %[[R:.*]] = add i32 %[[R1]], %[[R2]]
+; CHECK-NEXT: ret i32 %[[R]]
+}
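+;
+; Contrast the two exits checked above: %loop_exit is reachable both along
+; invariant case edges and from the non-invariant %continue path, so it has
+; to be split, while %loop_exit2 is reached only along the invariant case 1
+; edge and merely has its phi operands rewritten to come from %entry.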
diff --git a/test/Transforms/SpeculativeExecution/spec-other.ll b/test/Transforms/SpeculativeExecution/spec-other.ll
deleted file mode 100644
index 65e14b69e9e6..000000000000
--- a/test/Transforms/SpeculativeExecution/spec-other.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: opt < %s -S -speculative-execution \
-; RUN: -spec-exec-max-speculation-cost 4 -spec-exec-max-not-hoisted 3 \
-; RUN: | FileCheck %s
-
-; CHECK-LABEL: @ifThen_extractvalue(
-; CHECK: extractvalue
-; CHECK: br i1 true
-define void @ifThen_extractvalue() {
- br i1 true, label %a, label %b
-
-a:
- %x = extractvalue { i32, i32 } undef, 0
- br label %b
-
-b:
- ret void
-}
-
-; CHECK-LABEL: @ifThen_insertvalue(
-; CHECK: insertvalue
-; CHECK: br i1 true
-define void @ifThen_insertvalue() {
- br i1 true, label %a, label %b
-
-a:
- %x = insertvalue { i32, i32 } undef, i32 undef, 0
- br label %b
-
-b:
- ret void
-}
-
diff --git a/test/Transforms/SpeculativeExecution/spec-vector.ll b/test/Transforms/SpeculativeExecution/spec-vector.ll
deleted file mode 100644
index 9c64f1fb1005..000000000000
--- a/test/Transforms/SpeculativeExecution/spec-vector.ll
+++ /dev/null
@@ -1,73 +0,0 @@
-; RUN: opt < %s -S -speculative-execution \
-; RUN: -spec-exec-max-speculation-cost 4 -spec-exec-max-not-hoisted 3 \
-; RUN: | FileCheck %s
-
-; CHECK-LABEL: @ifThen_extractelement_constindex(
-; CHECK: extractelement
-; CHECK: br i1 true
-define void @ifThen_extractelement_constindex() {
- br i1 true, label %a, label %b
-
-a:
- %x = extractelement <4 x i32> undef, i32 0
- br label %b
-
-b:
- ret void
-}
-
-; CHECK-LABEL: @ifThen_extractelement_varindex(
-; CHECK: extractelement
-; CHECK: br i1 true
-define void @ifThen_extractelement_varindex(i32 %idx) {
- br i1 true, label %a, label %b
-
-a:
- %x = extractelement <4 x i32> undef, i32 %idx
- br label %b
-
-b:
- ret void
-}
-
-; CHECK-LABEL: @ifThen_insertelement_constindex(
-; CHECK: insertelement
-; CHECK: br i1 true
-define void @ifThen_insertelement_constindex() {
- br i1 true, label %a, label %b
-
-a:
- %x = insertelement <4 x i32> undef, i32 undef, i32 0
- br label %b
-
-b:
- ret void
-}
-
-; CHECK-LABEL: @ifThen_insertelement_varindex(
-; CHECK: insertelement
-; CHECK: br i1 true
-define void @ifThen_insertelement_varindex(i32 %idx) {
- br i1 true, label %a, label %b
-
-a:
- %x = insertelement <4 x i32> undef, i32 undef, i32 %idx
- br label %b
-
-b:
- ret void
-}
-
-; CHECK-LABEL: @ifThen_shufflevector(
-; CHECK: shufflevector
-; CHECK: br i1 true
-define void @ifThen_shufflevector() {
- br i1 true, label %a, label %b
-
-a:
- %x = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> undef
- br label %b
-
-b:
- ret void
-}
diff --git a/test/Transforms/Util/split-bit-piece.ll b/test/Transforms/Util/split-bit-piece.ll
index 3d7bcac73ca3..5a374e839926 100644
--- a/test/Transforms/Util/split-bit-piece.ll
+++ b/test/Transforms/Util/split-bit-piece.ll
@@ -3,43 +3,85 @@
; if it only describes part of the variable.
; RUN: opt -S -sroa %s | FileCheck %s
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
+; Built from:
+; struct foo { bool b; long i; };
+; void f(bool b, bool frag, foo g) {
+; }
+; The dbg.declare for 'frag' was then modified by hand to use a fragmented
+; DIExpression (offset: 0, size: 4) to exercise the dbg.declare+fragment case.
-; Function Attrs: nounwind uwtable
-define hidden void @_ZN6__tsan9FastState14SetHistorySizeEi(i32 %hs) #1 align 2 {
+; Expect two fragments:
+; * first starting at bit 0, 8 bits (for the bool)
+; * second starting at bit 32, 64 bits (for the long)
+; (this happens to create/demonstrate a gap from bits [8, 32))
+
+; But also check that a complex expression is not used for a lone bool
+; parameter: it can reference the register it is in directly, without masking
+; off the high bits.
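+;
+; For reference, DW_OP_LLVM_fragment takes a bit offset followed by a bit
+; size, so !DIExpression(DW_OP_LLVM_fragment, 0, 8) describes bits [0, 8) of
+; the variable; the gap noted above is simply the bit range that no fragment
+; covers.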
+
+; CHECK: call void @llvm.dbg.value(metadata i8 %g.coerce0, i64 0, metadata ![[VAR_STRUCT:[0-9]+]], metadata ![[EXPR_STRUCT1:[0-9]+]])
+; CHECK: call void @llvm.dbg.value(metadata i64 %g.coerce1, i64 0, metadata ![[VAR_STRUCT]], metadata ![[EXPR_STRUCT2:[0-9]+]])
+; CHECK: call void @llvm.dbg.value(metadata i1 %b, i64 0, metadata ![[VAR_BOOL:[0-9]+]], metadata ![[EXPR_BOOL:[0-9]+]])
+; CHECK: call void @llvm.dbg.value(metadata i1 %frag, i64 0, metadata ![[FRAG_VAR:[0-9]+]], metadata ![[FRAG_EXPR:[0-9]+]])
+; CHECK: ![[EXPR_STRUCT1]] = !DIExpression(DW_OP_LLVM_fragment, 0, 8)
+; CHECK: ![[EXPR_STRUCT2]] = !DIExpression(DW_OP_LLVM_fragment, 32, 64)
+; CHECK: ![[EXPR_BOOL]] = !DIExpression()
+; CHECK: ![[FRAG_EXPR]] = !DIExpression(DW_OP_LLVM_fragment, 0, 1)
+
+%struct.foo = type { i8, i64 }
+
+; Function Attrs: noinline nounwind uwtable
+define void @_Z1fbb3foo(i1 zeroext %b, i1 zeroext %frag, i8 %g.coerce0, i64 %g.coerce1) #0 !dbg !6 {
entry:
- %hs.addr = alloca i32, align 4
- %v1 = alloca i64, align 8
- %v2 = alloca i64, align 8
- store i32 %hs, i32* %hs.addr, align 4
-; CHECK: call void @llvm.dbg.value(metadata i32 %hs, i64 0, metadata !{{[0-9]+}}, metadata ![[EXPR:[0-9]+]])
-; CHECK: ![[EXPR]] = !DIExpression(DW_OP_LLVM_fragment, 0
- call void @llvm.dbg.declare(metadata i64* %v1, metadata !9, metadata !12), !dbg !13
- %0 = load i32, i32* %hs.addr, align 4
- %conv = sext i32 %0 to i64
- store i64 %conv, i64* %v1, align 8
- %1 = load i64, i64* %v2, align 8
- unreachable
+ %g = alloca %struct.foo, align 8
+ %b.addr = alloca i8, align 1
+ %frag.addr = alloca i8, align 1
+ %0 = bitcast %struct.foo* %g to { i8, i64 }*
+ %1 = getelementptr inbounds { i8, i64 }, { i8, i64 }* %0, i32 0, i32 0
+ store i8 %g.coerce0, i8* %1, align 8
+ %2 = getelementptr inbounds { i8, i64 }, { i8, i64 }* %0, i32 0, i32 1
+ store i64 %g.coerce1, i64* %2, align 8
+ %frombool = zext i1 %b to i8
+ store i8 %frombool, i8* %b.addr, align 1
+ call void @llvm.dbg.declare(metadata i8* %b.addr, metadata !15, metadata !16), !dbg !17
+ %frombool1 = zext i1 %frag to i8
+ store i8 %frombool1, i8* %frag.addr, align 1
+ call void @llvm.dbg.declare(metadata i8* %frag.addr, metadata !18, metadata !23), !dbg !19
+ call void @llvm.dbg.declare(metadata %struct.foo* %g, metadata !20, metadata !16), !dbg !21
+ ret void, !dbg !22
}
-attributes #0 = { nounwind readnone }
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone speculatable }
!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!7}
-!llvm.ident = !{!8}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 256979) (llvm/trunk 257107)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, retainedTypes: !2)
-!1 = !DIFile(filename: "tsan_shadow_test.cc", directory: "/tmp")
-!2 = !{!3, !5}
-!3 = !DICompositeType(tag: DW_TAG_class_type, name: "FastState", file: !4, line: 91, size: 64, align: 64, identifier: "_ZTSN6__tsan9FastStateE")
-!4 = !DIFile(filename: "/mnt/extra/llvm/projects/compiler-rt/lib/tsan/rtl/tsan_rtl.h", directory: "/tmp")
-!5 = distinct !DIDerivedType(tag: DW_TAG_typedef, name: "u64", line: 78, baseType: !6)
-!6 = !DIBasicType(name: "long long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
-!7 = !{i32 2, !"Debug Info Version", i32 3}
-!8 = !{!"clang version 3.8.0 (trunk 256979) (llvm/trunk 257107)"}
-!9 = !DILocalVariable(name: "v1", scope: !10, file: !4, line: 136, type: !5)
-!10 = distinct !DILexicalBlock(scope: !11, file: !4, line: 136, column: 5)
-!11 = distinct !DISubprogram(name: "SetHistorySize", linkageName: "_ZN6__tsan9FastState14SetHistorySizeEi", scope: !3, file: !4, line: 135, isLocal: false, isDefinition: true, scopeLine: 135, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
-!12 = !DIExpression()
-!13 = !DILocation(line: 136, column: 5, scope: !10)
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 303077) (llvm/trunk 303098)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "foo.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 5.0.0 (trunk 303077) (llvm/trunk 303098)"}
+!6 = distinct !DISubprogram(name: "f", linkageName: "_Z1fbb3foo", scope: !1, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null, !9, !9, !10}
+!9 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean)
+!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "foo", file: !1, line: 1, size: 128, elements: !11, identifier: "_ZTS3foo")
+!11 = !{!12, !13}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !10, file: !1, line: 1, baseType: !9, size: 8)
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "i", scope: !10, file: !1, line: 1, baseType: !14, size: 64, offset: 64)
+!14 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+!15 = !DILocalVariable(name: "b", arg: 1, scope: !6, file: !1, line: 2, type: !9)
+!16 = !DIExpression()
+!17 = !DILocation(line: 2, column: 13, scope: !6)
+!18 = !DILocalVariable(name: "frag", arg: 2, scope: !6, file: !1, line: 2, type: !9)
+!19 = !DILocation(line: 2, column: 21, scope: !6)
+!20 = !DILocalVariable(name: "g", arg: 3, scope: !6, file: !1, line: 2, type: !10)
+!21 = !DILocation(line: 2, column: 31, scope: !6)
+!22 = !DILocation(line: 3, column: 1, scope: !6)
+!23 = !DIExpression(DW_OP_LLVM_fragment, 0, 4)
diff --git a/test/Verifier/metadata-function-dbg.ll b/test/Verifier/metadata-function-dbg.ll
index 24989ed7aa2e..6db40943ec38 100644
--- a/test/Verifier/metadata-function-dbg.ll
+++ b/test/Verifier/metadata-function-dbg.ll
@@ -3,12 +3,18 @@
; CHECK: function declaration may not have a !dbg attachment
declare !dbg !4 void @f1()
-define void @f2() !dbg !4 {
+; CHECK: function must have a single !dbg attachment
+define void @f2() !dbg !4 !dbg !4 {
unreachable
}
-; CHECK: function must have a single !dbg attachment
-define void @f3() !dbg !4 !dbg !4 {
+; CHECK: DISubprogram attached to more than one function
+define void @f3() !dbg !4 {
+ unreachable
+}
+
+; CHECK: DISubprogram attached to more than one function
+define void @f4() !dbg !4 {
unreachable
}
@@ -16,7 +22,7 @@ define void @f3() !dbg !4 !dbg !4 {
; CHECK: function !dbg attachment must be a subprogram
; CHECK-NEXT: void ()* @bar
; CHECK-NEXT: !{{[0-9]+}} = !{}
-define void @bar() !dbg !6 {
+define void @bar() !dbg !3 {
unreachable
}
@@ -26,5 +32,5 @@ define void @bar() !dbg !6 {
!llvm.dbg.cu = !{!1}
!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2)
!2 = !DIFile(filename: "t.c", directory: "/path/to/dir")
+!3 = !{}
!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !2, unit: !1)
-!6 = !{}
diff --git a/test/tools/llvm-pdbdump/Inputs/FilterTest.cpp b/test/tools/llvm-pdbdump/Inputs/FilterTest.cpp
index bcf9360d4a9b..4dd5581e2fca 100644
--- a/test/tools/llvm-pdbdump/Inputs/FilterTest.cpp
+++ b/test/tools/llvm-pdbdump/Inputs/FilterTest.cpp
@@ -10,6 +10,8 @@ public:
void MemberFunc() {}
+ int foo() const { return IntMemberVar; }
+
private:
int IntMemberVar;
double DoubleMemberVar;
@@ -18,10 +20,26 @@ private:
int IntGlobalVar;
double DoubleGlobalVar;
typedef int GlobalTypedef;
+char OneByte;
+char TwoBytes[2];
+char ThreeBytes[3];
+
enum GlobalEnum {
GlobalEnumVal1
} GlobalEnumVar;
+int CFunc() {
+ return (int)OneByte * 2;
+}
+int BFunc() {
+ return 42;
+}
+int AFunc() {
+ static FilterTestClass FC;
+
+ return (CFunc() + BFunc()) * IntGlobalVar + FC.foo();
+}
+
int main(int argc, char **argv) {
FilterTestClass TestClass;
GlobalTypedef v1;
diff --git a/test/tools/llvm-pdbdump/Inputs/FilterTest.pdb b/test/tools/llvm-pdbdump/Inputs/FilterTest.pdb
index 5f01ec701b81..ce7e017f9151 100644
--- a/test/tools/llvm-pdbdump/Inputs/FilterTest.pdb
+++ b/test/tools/llvm-pdbdump/Inputs/FilterTest.pdb
Binary files differ
diff --git a/test/tools/llvm-pdbdump/regex-filter.test b/test/tools/llvm-pdbdump/regex-filter.test
index d2f500e88c33..36c3da33e2e4 100644
--- a/test/tools/llvm-pdbdump/regex-filter.test
+++ b/test/tools/llvm-pdbdump/regex-filter.test
@@ -1,4 +1,4 @@
-; RUN: llvm-pdbdump pretty -symbols -globals -types %p/Inputs/FilterTest.pdb \
+; RUN: llvm-pdbdump pretty -module-syms -globals -types %p/Inputs/FilterTest.pdb \
; RUN: | FileCheck --check-prefix=NO_FILTER %s
; RUN: llvm-pdbdump pretty -types -exclude-types="GlobalTypedef|NestedTypedef" \
@@ -11,15 +11,15 @@
; RUN: llvm-pdbdump pretty -classes -typedefs %p/Inputs/FilterTest.pdb \
; RUN: | FileCheck --check-prefix=EXCLUDE_ENUMS %s
-; RUN: llvm-pdbdump pretty -types -symbols -globals -exclude-symbols="MemberVar|GlobalVar" \
+; RUN: llvm-pdbdump pretty -types -module-syms -globals -exclude-symbols="MemberVar|GlobalVar" \
; RUN: %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=EXCLUDE_VARS %s
; RUN: llvm-pdbdump pretty -types -exclude-types="FilterTestClass" \
; RUN: %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=EXCLUDE_WHOLE_CLASS %s
-; RUN: llvm-pdbdump pretty -symbols -globals -exclude-compilands="FilterTest.obj" \
+; RUN: llvm-pdbdump pretty -module-syms -globals -exclude-compilands="FilterTest.obj" \
; RUN: %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=EXCLUDE_COMPILAND %s
; RUN: llvm-pdbdump pretty -types -include-types="FilterTestClass" \
; RUN: %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=INCLUDE_ONLY_TYPES %s
-; RUN: llvm-pdbdump pretty -types -symbols -globals -include-symbols="[[:<:]](IntGlobalVar|DoubleGlobalVar)[[:>:]]" \
+; RUN: llvm-pdbdump pretty -types -module-syms -globals -include-symbols="[[:<:]](IntGlobalVar|DoubleGlobalVar)[[:>:]]" \
; RUN: %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=INCLUDE_ONLY_VARS %s
; NO_FILTER: ---TYPES---
diff --git a/test/tools/llvm-pdbdump/symbol-filters.test b/test/tools/llvm-pdbdump/symbol-filters.test
new file mode 100644
index 000000000000..d12d2aa8be0f
--- /dev/null
+++ b/test/tools/llvm-pdbdump/symbol-filters.test
@@ -0,0 +1,74 @@
+; RUN: llvm-pdbdump pretty -globals -module-syms -sym-types=data %p/Inputs/FilterTest.pdb \
+; RUN: | FileCheck --check-prefix=ONLY_DATA %s
+
+; RUN: llvm-pdbdump pretty -globals -module-syms -sym-types=thunks %p/Inputs/FilterTest.pdb \
+; RUN: | FileCheck --check-prefix=ONLY_THUNKS %s
+
+; RUN: llvm-pdbdump pretty -globals -module-syms -sym-types=funcs %p/Inputs/FilterTest.pdb \
+; RUN: | FileCheck --check-prefix=ONLY_FUNCS %s
+
+; RUN: llvm-pdbdump pretty -globals -module-syms -sym-types=funcs -sym-types=data \
+; RUN: %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=TWO_TYPES %s
+
+; RUN: llvm-pdbdump pretty -globals -module-syms -sym-types=data \
+; RUN: -symbol-order=name %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=NAME_SORT_DATA %s
+
+; RUN: llvm-pdbdump pretty -globals -module-syms -sym-types=data \
+; RUN: -symbol-order=size %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=SIZE_SORT_DATA %s
+
+; RUN: llvm-pdbdump pretty -globals -module-syms -sym-types=funcs \
+; RUN: -symbol-order=name %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=NAME_SORT_FUNCS %s
+
+; RUN: llvm-pdbdump pretty -globals -module-syms -sym-types=funcs \
+; RUN: -symbol-order=size %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=SIZE_SORT_FUNCS %s
+
+; ONLY_DATA-NOT: func
+; ONLY_DATA-NOT: thunk
+; ONLY_DATA-DAG: data {{.*}} static char OneByte
+; ONLY_DATA-DAG: data {{.*}} static double DoubleGlobalVar
+; ONLY_DATA-DAG: data {{.*}} static char TwoBytes[2]
+; ONLY_DATA-DAG: data {{.*}} static char ThreeBytes[3]
+; ONLY_DATA-DAG: data {{.*}} static int IntGlobalVar
+; ONLY_DATA-DAG: data {{.*}} static GlobalEnum GlobalEnumVar
+
+; ONLY_FUNCS-NOT: data
+; ONLY_FUNCS-NOT: thunk
+; ONLY_FUNCS: func {{.*}} int __cdecl main(int argc, char** argv)
+; ONLY_FUNCS: func {{.*}} int __cdecl CFunc()
+; ONLY_FUNCS: func {{.*}} int __cdecl BFunc()
+; ONLY_FUNCS: func {{.*}} int __cdecl AFunc()
+; ONLY_FUNCS: func {{.*}} int FilterTestClass::foo()
+
+; ONLY_THUNKS-NOT: func
+; ONLY_THUNKS-NOT: data
+; ONLY_THUNKS-DAG: thunk {{.*}} (TrampIncremental)
+
+; TWO_TYPES-NOT: thunk
+; TWO_TYPES-DAG: func {{.*}} int __cdecl main(int argc, char** argv)
+; TWO_TYPES-DAG: data {{.*}} static double DoubleGlobalVar
+
+; NAME_SORT_DATA: data {{.*}} static double DoubleGlobalVar
+; NAME_SORT_DATA: data {{.*}} static GlobalEnum GlobalEnumVar
+; NAME_SORT_DATA: data {{.*}} static int IntGlobalVar
+; NAME_SORT_DATA: data {{.*}} static char OneByte
+; NAME_SORT_DATA: data {{.*}} static char ThreeBytes[3]
+; NAME_SORT_DATA: data {{.*}} static char TwoBytes[2]
+
+; SIZE_SORT_DATA: data {{.*}}sizeof=8{{.*}}double DoubleGlobalVar
+; SIZE_SORT_DATA-DAG: data {{.*}}sizeof=4{{.*}}GlobalEnum GlobalEnumVar
+; SIZE_SORT_DATA-DAG: data {{.*}}sizeof=4{{.*}}int IntGlobalVar
+; SIZE_SORT_DATA: data {{.*}}sizeof=3{{.*}}char ThreeBytes[3]
+; SIZE_SORT_DATA: data {{.*}}sizeof=2{{.*}}char TwoBytes[2]
+; SIZE_SORT_DATA: data {{.*}}sizeof=1{{.*}}char OneByte
+
+; NAME_SORT_FUNCS: func {{.*}}sizeof= 40{{.*}}AFunc
+; NAME_SORT_FUNCS: func {{.*}}sizeof= 10{{.*}}BFunc
+; NAME_SORT_FUNCS: func {{.*}}sizeof= 14{{.*}}CFunc
+; NAME_SORT_FUNCS: func {{.*}}sizeof= 16{{.*}}FilterTestClass::foo
+; NAME_SORT_FUNCS: func {{.*}}sizeof= 7{{.*}}main
+
+; SIZE_SORT_FUNCS: func {{.*}}sizeof= 40{{.*}}AFunc
+; SIZE_SORT_FUNCS: func {{.*}}sizeof= 16{{.*}}FilterTestClass::foo
+; SIZE_SORT_FUNCS: func {{.*}}sizeof= 14{{.*}}CFunc
+; SIZE_SORT_FUNCS: func {{.*}}sizeof= 10{{.*}}BFunc
+; SIZE_SORT_FUNCS: func {{.*}}sizeof= 7{{.*}}main
diff --git a/test/tools/llvm-profdata/sample-profile-basic.test b/test/tools/llvm-profdata/sample-profile-basic.test
index 211d8c5bbd84..3ba42c20f2e8 100644
--- a/test/tools/llvm-profdata/sample-profile-basic.test
+++ b/test/tools/llvm-profdata/sample-profile-basic.test
@@ -25,9 +25,10 @@ RUN: diff %t-binary %t-text
counters have doubled.
RUN: llvm-profdata merge --sample %p/Inputs/sample-profile.proftext -o %t-binprof
RUN: llvm-profdata merge --sample --text %p/Inputs/sample-profile.proftext %t-binprof -o - | FileCheck %s --check-prefix=MERGE1
-MERGE1-DAG: main:368038:0
-MERGE1-DAG: 9: 4128 _Z3fooi:1262 _Z3bari:2942
-MERGE1-DAG: _Z3fooi:15422:1220
+MERGE1: main:368038:0
+MERGE1: 9: 4128 _Z3fooi:1262 _Z3bari:2942
+MERGE1: _Z3bari:40602:2874
+MERGE1: _Z3fooi:15422:1220
5- Detect invalid text encoding (e.g. instrumentation profile text format).
RUN: not llvm-profdata show --sample %p/Inputs/foo3bar3-1.proftext 2>&1 | FileCheck %s --check-prefix=BADTEXT
diff --git a/test/tools/llvm-readobj/wasm-invalid.test b/test/tools/llvm-readobj/wasm-invalid.test
new file mode 100644
index 000000000000..d500d582ca03
--- /dev/null
+++ b/test/tools/llvm-readobj/wasm-invalid.test
@@ -0,0 +1,7 @@
+# RUN: yaml2obj %s | not llvm-readobj -t - 2>&1 | FileCheck %s
+
+--- !WASM
+FileHeader:
+ Version: 0x0000000c
+
+# CHECK: Error reading file: <stdin>: Bad version number
diff --git a/tools/bugpoint/ExtractFunction.cpp b/tools/bugpoint/ExtractFunction.cpp
index 82c61b6e1be7..72872e83f792 100644
--- a/tools/bugpoint/ExtractFunction.cpp
+++ b/tools/bugpoint/ExtractFunction.cpp
@@ -232,8 +232,7 @@ static Constant *GetTorInit(std::vector<std::pair<Function *, int>> &TorList) {
std::vector<Constant *> ArrayElts;
Type *Int32Ty = Type::getInt32Ty(TorList[0].first->getContext());
- StructType *STy =
- StructType::get(Int32Ty, TorList[0].first->getType(), nullptr);
+ StructType *STy = StructType::get(Int32Ty, TorList[0].first->getType());
for (unsigned i = 0, e = TorList.size(); i != e; ++i) {
Constant *Elts[] = {ConstantInt::get(Int32Ty, TorList[i].second),
TorList[i].first};
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index 7c81abaed755..8c786950036f 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -301,6 +301,8 @@ int main(int argc, char **argv) {
initializeConstantHoistingLegacyPassPass(*Registry);
initializeScalarOpts(*Registry);
initializeVectorization(*Registry);
+ initializeScalarizeMaskedMemIntrinPass(*Registry);
+ initializeExpandReductionsPass(*Registry);
// Register the target printer for --version.
cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
diff --git a/tools/lli/RemoteJITUtils.h b/tools/lli/RemoteJITUtils.h
index 89a514202567..3c82f73ff072 100644
--- a/tools/lli/RemoteJITUtils.h
+++ b/tools/lli/RemoteJITUtils.h
@@ -118,9 +118,8 @@ public:
MemMgr->registerEHFrames(Addr, LoadAddr, Size);
}
- void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr,
- size_t Size) override {
- MemMgr->deregisterEHFrames(Addr, LoadAddr, Size);
+ void deregisterEHFrames() override {
+ MemMgr->deregisterEHFrames();
}
bool finalizeMemory(std::string *ErrMsg = nullptr) override {
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index 1519464521dd..3de260410bd9 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -16,7 +16,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include "llvm/LibDriver/LibDriver.h"
+#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/MachO.h"
diff --git a/tools/llvm-pdbdump/LLVMOutputStyle.cpp b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
index 2dd4ef0fb30d..e975a5220af6 100644
--- a/tools/llvm-pdbdump/LLVMOutputStyle.cpp
+++ b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
@@ -180,7 +180,7 @@ private:
CompactTypeDumpVisitor CTDV(DB, Index, &P);
CVTypeVisitor Visitor(CTDV);
DictScope D(P, Label);
- if (DB.containsTypeIndex(Index)) {
+ if (DB.contains(Index)) {
CVType &Type = DB.getTypeRecord(Index);
if (auto EC = Visitor.visitTypeRecord(Type))
return EC;
diff --git a/tools/llvm-pdbdump/PrettyCompilandDumper.cpp b/tools/llvm-pdbdump/PrettyCompilandDumper.cpp
index 6257313e3e1a..9cf7bf82a164 100644
--- a/tools/llvm-pdbdump/PrettyCompilandDumper.cpp
+++ b/tools/llvm-pdbdump/PrettyCompilandDumper.cpp
@@ -115,6 +115,8 @@ void CompilandDumper::start(const PDBSymbolCompiland &Symbol,
}
void CompilandDumper::dump(const PDBSymbolData &Symbol) {
+ if (!shouldDumpSymLevel(opts::pretty::SymLevel::Data))
+ return;
if (Printer.IsSymbolExcluded(Symbol.getName()))
return;
@@ -125,11 +127,17 @@ void CompilandDumper::dump(const PDBSymbolData &Symbol) {
Printer << "data: ";
WithColor(Printer, PDB_ColorItem::Address).get()
<< "[" << format_hex(Symbol.getVirtualAddress(), 10) << "]";
+
+ WithColor(Printer, PDB_ColorItem::Comment).get()
+ << " [sizeof = " << getTypeLength(Symbol) << "]";
+
break;
case PDB_LocType::Constant:
Printer << "constant: ";
WithColor(Printer, PDB_ColorItem::LiteralValue).get()
<< "[" << Symbol.getValue() << "]";
+ WithColor(Printer, PDB_ColorItem::Comment).get()
+ << " [sizeof = " << getTypeLength(Symbol) << "]";
break;
default:
Printer << "data(unexpected type=" << LocType << ")";
@@ -140,6 +148,8 @@ void CompilandDumper::dump(const PDBSymbolData &Symbol) {
}
void CompilandDumper::dump(const PDBSymbolFunc &Symbol) {
+ if (!shouldDumpSymLevel(opts::pretty::SymLevel::Functions))
+ return;
if (Symbol.getLength() == 0)
return;
if (Printer.IsSymbolExcluded(Symbol.getName()))
@@ -162,6 +172,8 @@ void CompilandDumper::dump(const PDBSymbolLabel &Symbol) {
}
void CompilandDumper::dump(const PDBSymbolThunk &Symbol) {
+ if (!shouldDumpSymLevel(opts::pretty::SymLevel::Thunks))
+ return;
if (Printer.IsSymbolExcluded(Symbol.getName()))
return;
diff --git a/tools/llvm-pdbdump/PrettyFunctionDumper.cpp b/tools/llvm-pdbdump/PrettyFunctionDumper.cpp
index b0be33c157ce..8b2043989b81 100644
--- a/tools/llvm-pdbdump/PrettyFunctionDumper.cpp
+++ b/tools/llvm-pdbdump/PrettyFunctionDumper.cpp
@@ -26,6 +26,7 @@
#include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
using namespace llvm;
using namespace llvm::codeview;
@@ -119,14 +120,19 @@ void FunctionDumper::start(const PDBSymbolFunc &Symbol, PointerType Pointer) {
WithColor(Printer, PDB_ColorItem::Address).get() << format_hex(FuncStart, 10);
if (auto DebugStart = Symbol.findOneChild<PDBSymbolFuncDebugStart>()) {
uint64_t Prologue = DebugStart->getVirtualAddress() - FuncStart;
- WithColor(Printer, PDB_ColorItem::Offset).get() << "+" << Prologue;
+ WithColor(Printer, PDB_ColorItem::Offset).get()
+ << formatv("+{0,2}", Prologue);
}
Printer << " - ";
WithColor(Printer, PDB_ColorItem::Address).get() << format_hex(FuncEnd, 10);
if (auto DebugEnd = Symbol.findOneChild<PDBSymbolFuncDebugEnd>()) {
uint64_t Epilogue = FuncEnd - DebugEnd->getVirtualAddress();
- WithColor(Printer, PDB_ColorItem::Offset).get() << "-" << Epilogue;
+ WithColor(Printer, PDB_ColorItem::Offset).get()
+ << formatv("-{0,2}", Epilogue);
}
+
+ WithColor(Printer, PDB_ColorItem::Comment).get()
+ << formatv(" | sizeof={0,3}", Symbol.getLength());
Printer << "] (";
if (Symbol.hasFramePointer()) {
diff --git a/tools/llvm-pdbdump/llvm-pdbdump.cpp b/tools/llvm-pdbdump/llvm-pdbdump.cpp
index 4cdd87620c86..0e5913fa3c93 100644
--- a/tools/llvm-pdbdump/llvm-pdbdump.cpp
+++ b/tools/llvm-pdbdump/llvm-pdbdump.cpp
@@ -110,12 +110,22 @@ cl::list<std::string> InputFilenames(cl::Positional,
cl::opt<bool> Compilands("compilands", cl::desc("Display compilands"),
cl::cat(TypeCategory), cl::sub(PrettySubcommand));
-cl::opt<bool> Symbols("symbols", cl::desc("Display symbols for each compiland"),
+cl::opt<bool> Symbols("module-syms",
+ cl::desc("Display symbols for each compiland"),
cl::cat(TypeCategory), cl::sub(PrettySubcommand));
cl::opt<bool> Globals("globals", cl::desc("Dump global symbols"),
cl::cat(TypeCategory), cl::sub(PrettySubcommand));
cl::opt<bool> Externals("externals", cl::desc("Dump external symbols"),
cl::cat(TypeCategory), cl::sub(PrettySubcommand));
+cl::list<SymLevel> SymTypes(
+ "sym-types", cl::desc("Type of symbols to dump (default all)"),
+ cl::cat(TypeCategory), cl::sub(PrettySubcommand), cl::ZeroOrMore,
+ cl::values(
+ clEnumValN(SymLevel::Thunks, "thunks", "Display thunk symbols"),
+ clEnumValN(SymLevel::Data, "data", "Display data symbols"),
+ clEnumValN(SymLevel::Functions, "funcs", "Display function symbols"),
+ clEnumValN(SymLevel::All, "all", "Display all symbols (default)")));
+
cl::opt<bool>
Types("types",
cl::desc("Display all types (implies -classes, -enums, -typedefs)"),
@@ -126,6 +136,16 @@ cl::opt<bool> Enums("enums", cl::desc("Display enum types"),
cl::cat(TypeCategory), cl::sub(PrettySubcommand));
cl::opt<bool> Typedefs("typedefs", cl::desc("Display typedef types"),
cl::cat(TypeCategory), cl::sub(PrettySubcommand));
+cl::opt<SymbolSortMode> SymbolOrder(
+ "symbol-order", cl::desc("symbol sort order"),
+ cl::init(SymbolSortMode::None),
+ cl::values(clEnumValN(SymbolSortMode::None, "none",
+ "Undefined / no particular sort order"),
+ clEnumValN(SymbolSortMode::Name, "name", "Sort symbols by name"),
+ clEnumValN(SymbolSortMode::Size, "size",
+ "Sort symbols by size")),
+ cl::cat(TypeCategory), cl::sub(PrettySubcommand));
+
cl::opt<ClassSortMode> ClassOrder(
"class-order", cl::desc("Class sort order"), cl::init(ClassSortMode::None),
cl::values(
@@ -620,6 +640,49 @@ static void diff(StringRef Path1, StringRef Path2) {
ExitOnErr(O->dump());
}
+bool opts::pretty::shouldDumpSymLevel(SymLevel Search) {
+ if (SymTypes.empty())
+ return true;
+ if (llvm::find(SymTypes, Search) != SymTypes.end())
+ return true;
+ if (llvm::find(SymTypes, SymLevel::All) != SymTypes.end())
+ return true;
+ return false;
+}
+
+uint32_t llvm::pdb::getTypeLength(const PDBSymbolData &Symbol) {
+ auto SymbolType = Symbol.getType();
+ const IPDBRawSymbol &RawType = SymbolType->getRawSymbol();
+
+ return RawType.getLength();
+}
+
+bool opts::pretty::compareFunctionSymbols(
+ const std::unique_ptr<PDBSymbolFunc> &F1,
+ const std::unique_ptr<PDBSymbolFunc> &F2) {
+ assert(opts::pretty::SymbolOrder != opts::pretty::SymbolSortMode::None);
+
+ if (opts::pretty::SymbolOrder == opts::pretty::SymbolSortMode::Name)
+ return F1->getName() < F2->getName();
+
+ // Note that we intentionally sort in descending order on length, since
+ // long functions are more interesting than short functions.
+ return F1->getLength() > F2->getLength();
+}
+
+bool opts::pretty::compareDataSymbols(
+ const std::unique_ptr<PDBSymbolData> &F1,
+ const std::unique_ptr<PDBSymbolData> &F2) {
+ assert(opts::pretty::SymbolOrder != opts::pretty::SymbolSortMode::None);
+
+ if (opts::pretty::SymbolOrder == opts::pretty::SymbolSortMode::Name)
+ return F1->getName() < F2->getName();
+
+ // Note that we intentionally sort in descending order on length, since
+ // large types are more interesting than small ones.
+ return getTypeLength(*F1) > getTypeLength(*F2);
+}
+
static void dumpPretty(StringRef Path) {
std::unique_ptr<IPDBSession> Session;
@@ -708,21 +771,42 @@ static void dumpPretty(StringRef Path) {
Printer.NewLine();
WithColor(Printer, PDB_ColorItem::SectionHeader).get() << "---GLOBALS---";
Printer.Indent();
- {
+ if (shouldDumpSymLevel(opts::pretty::SymLevel::Functions)) {
FunctionDumper Dumper(Printer);
auto Functions = GlobalScope->findAllChildren<PDBSymbolFunc>();
- while (auto Function = Functions->getNext()) {
- Printer.NewLine();
- Dumper.start(*Function, FunctionDumper::PointerType::None);
+ if (opts::pretty::SymbolOrder == opts::pretty::SymbolSortMode::None) {
+ while (auto Function = Functions->getNext()) {
+ Printer.NewLine();
+ Dumper.start(*Function, FunctionDumper::PointerType::None);
+ }
+ } else {
+ std::vector<std::unique_ptr<PDBSymbolFunc>> Funcs;
+ while (auto Func = Functions->getNext())
+ Funcs.push_back(std::move(Func));
+ std::sort(Funcs.begin(), Funcs.end(),
+ opts::pretty::compareFunctionSymbols);
+ for (const auto &Func : Funcs) {
+ Printer.NewLine();
+ Dumper.start(*Func, FunctionDumper::PointerType::None);
+ }
}
}
- {
+ if (shouldDumpSymLevel(opts::pretty::SymLevel::Data)) {
auto Vars = GlobalScope->findAllChildren<PDBSymbolData>();
VariableDumper Dumper(Printer);
- while (auto Var = Vars->getNext())
- Dumper.start(*Var);
+ if (opts::pretty::SymbolOrder == opts::pretty::SymbolSortMode::None) {
+ while (auto Var = Vars->getNext())
+ Dumper.start(*Var);
+ } else {
+ std::vector<std::unique_ptr<PDBSymbolData>> Datas;
+ while (auto Var = Vars->getNext())
+ Datas.push_back(std::move(Var));
+ std::sort(Datas.begin(), Datas.end(), opts::pretty::compareDataSymbols);
+ for (const auto &Var : Datas)
+ Dumper.start(*Var);
+ }
}
- {
+ if (shouldDumpSymLevel(opts::pretty::SymLevel::Thunks)) {
auto Thunks = GlobalScope->findAllChildren<PDBSymbolThunk>();
CompilandDumper Dumper(Printer);
while (auto Thunk = Thunks->getNext())
diff --git a/tools/llvm-pdbdump/llvm-pdbdump.h b/tools/llvm-pdbdump/llvm-pdbdump.h
index 8b1dde9399bf..e38b32c6a345 100644
--- a/tools/llvm-pdbdump/llvm-pdbdump.h
+++ b/tools/llvm-pdbdump/llvm-pdbdump.h
@@ -14,6 +14,17 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
+#include <memory>
+#include <stdint.h>
+
+namespace llvm {
+namespace pdb {
+class PDBSymbolData;
+class PDBSymbolFunc;
+uint32_t getTypeLength(const PDBSymbolData &Symbol);
+}
+}
+
namespace opts {
namespace pretty {
@@ -29,6 +40,17 @@ enum class ClassSortMode {
PaddingPctImmediate
};
+enum class SymbolSortMode { None, Name, Size };
+
+enum class SymLevel { Functions, Data, Thunks, All };
+
+bool shouldDumpSymLevel(SymLevel Level);
+bool compareFunctionSymbols(
+ const std::unique_ptr<llvm::pdb::PDBSymbolFunc> &F1,
+ const std::unique_ptr<llvm::pdb::PDBSymbolFunc> &F2);
+bool compareDataSymbols(const std::unique_ptr<llvm::pdb::PDBSymbolData> &F1,
+ const std::unique_ptr<llvm::pdb::PDBSymbolData> &F2);
+
extern llvm::cl::opt<bool> Compilands;
extern llvm::cl::opt<bool> Symbols;
extern llvm::cl::opt<bool> Globals;
@@ -45,6 +67,7 @@ extern llvm::cl::list<std::string> ExcludeCompilands;
extern llvm::cl::list<std::string> IncludeTypes;
extern llvm::cl::list<std::string> IncludeSymbols;
extern llvm::cl::list<std::string> IncludeCompilands;
+extern llvm::cl::opt<SymbolSortMode> SymbolOrder;
extern llvm::cl::opt<ClassSortMode> ClassOrder;
extern llvm::cl::opt<uint32_t> SizeThreshold;
extern llvm::cl::opt<uint32_t> PaddingThreshold;
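
For reference, a hypothetical invocation combining the options added above
(flag spellings are taken from the cl::opt declarations in this patch; the
subcommand name and PDB path are assumptions):

  llvm-pdbdump pretty -module-syms -globals -sym-types=funcs \
      -symbol-order=size foo.pdb

As compareFunctionSymbols notes, -symbol-order=size sorts functions in
descending order of length.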
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index 049af2c4f076..aca7de840d80 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -1562,6 +1562,14 @@ void COFFDumper::printResourceDirectoryTable(
raw_svector_ostream OS(IDStr);
if (i < Table.NumberOfNameEntries) {
ArrayRef<UTF16> RawEntryNameString = unwrapOrError(RSF.getEntryNameString(Entry));
+ std::vector<UTF16> EndianCorrectedNameString;
+ if (llvm::sys::IsBigEndianHost) {
+ EndianCorrectedNameString.resize(RawEntryNameString.size() + 1);
+ std::copy(RawEntryNameString.begin(), RawEntryNameString.end(),
+ EndianCorrectedNameString.begin() + 1);
+ EndianCorrectedNameString[0] = UNI_UTF16_BYTE_ORDER_MARK_SWAPPED;
+ RawEntryNameString = makeArrayRef(EndianCorrectedNameString);
+ }
std::string EntryNameString;
if (!llvm::convertUTF16ToUTF8String(RawEntryNameString, EntryNameString))
error(object_error::parse_failed);
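
For context on the fix above: convertUTF16ToUTF8String honors a leading
byte-order mark, so on a big-endian host the code prepends
UNI_UTF16_BYTE_ORDER_MARK_SWAPPED (0xFFFE) to signal that the UTF-16 payload
is in the opposite byte order and must be swapped during conversion.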
diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp
index 8a9d7bc720c3..cd7244a8f970 100644
--- a/tools/llvm-readobj/llvm-readobj.cpp
+++ b/tools/llvm-readobj/llvm-readobj.cpp
@@ -311,13 +311,6 @@ static void reportError(StringRef Input, std::error_code EC) {
reportError(Twine(Input) + ": " + EC.message());
}
-static void reportError(StringRef Input, StringRef Message) {
- if (Input == "-")
- Input = "<stdin>";
-
- reportError(Twine(Input) + ": " + Message);
-}
-
static void reportError(StringRef Input, Error Err) {
if (Input == "-")
Input = "<stdin>";
@@ -481,11 +474,7 @@ static void dumpArchive(const Archive *Arc) {
Expected<std::unique_ptr<Binary>> ChildOrErr = Child.getAsBinary();
if (!ChildOrErr) {
if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError())) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- logAllUnhandledErrors(ChildOrErr.takeError(), OS, "");
- OS.flush();
- reportError(Arc->getFileName(), Buf);
+ reportError(Arc->getFileName(), ChildOrErr.takeError());
}
continue;
}
@@ -507,11 +496,7 @@ static void dumpMachOUniversalBinary(const MachOUniversalBinary *UBinary) {
if (ObjOrErr)
dumpObject(&*ObjOrErr.get());
else if (auto E = isNotObjectErrorInvalidFileType(ObjOrErr.takeError())) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- logAllUnhandledErrors(ObjOrErr.takeError(), OS, "");
- OS.flush();
- reportError(UBinary->getFileName(), Buf);
+ reportError(UBinary->getFileName(), ObjOrErr.takeError());
}
else if (Expected<std::unique_ptr<Archive>> AOrErr = Obj.getAsArchive())
dumpArchive(&*AOrErr.get());
@@ -524,7 +509,7 @@ static void dumpInput(StringRef File) {
// Attempt to open the binary.
Expected<OwningBinary<Binary>> BinaryOrErr = createBinary(File);
if (!BinaryOrErr)
- reportError(File, errorToErrorCode(BinaryOrErr.takeError()));
+ reportError(File, BinaryOrErr.takeError());
Binary &Binary = *BinaryOrErr.get().getBinary();
if (Archive *Arc = dyn_cast<Archive>(&Binary))
diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index 75345de50280..ba130ce80be8 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -175,8 +175,7 @@ public:
void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
size_t Size) override {}
- void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr,
- size_t Size) override {}
+ void deregisterEHFrames() override {}
void preallocateSlab(uint64_t Size) {
std::string Err;
diff --git a/tools/obj2yaml/wasm2yaml.cpp b/tools/obj2yaml/wasm2yaml.cpp
index cc04b995f667..d4d978f028e2 100644
--- a/tools/obj2yaml/wasm2yaml.cpp
+++ b/tools/obj2yaml/wasm2yaml.cpp
@@ -25,6 +25,23 @@ public:
ErrorOr<WasmYAML::Object *> dump();
};
+WasmYAML::Table make_table(const wasm::WasmTable &Table) {
+ WasmYAML::Table T;
+ T.ElemType = Table.ElemType;
+ T.TableLimits.Flags = Table.Limits.Flags;
+ T.TableLimits.Initial = Table.Limits.Initial;
+ T.TableLimits.Maximum = Table.Limits.Maximum;
+ return T;
+}
+
+WasmYAML::Limits make_limits(const wasm::WasmLimits &Limits) {
+ WasmYAML::Limits L;
+ L.Flags = Limits.Flags;
+ L.Initial = Limits.Initial;
+ L.Maximum = Limits.Maximum;
+ return L;
+}
+
ErrorOr<WasmYAML::Object *> WasmDumper::dump() {
auto Y = make_unique<WasmYAML::Object>();
@@ -82,17 +99,26 @@ ErrorOr<WasmYAML::Object *> WasmDumper::dump() {
case wasm::WASM_SEC_IMPORT: {
auto ImportSec = make_unique<WasmYAML::ImportSection>();
for (auto &Import : Obj.imports()) {
- WasmYAML::Import Ex;
- Ex.Module = Import.Module;
- Ex.Field = Import.Field;
- Ex.Kind = Import.Kind;
- if (Ex.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
- Ex.SigIndex = Import.SigIndex;
- } else if (Ex.Kind == wasm::WASM_EXTERNAL_GLOBAL) {
- Ex.GlobalType = Import.GlobalType;
- Ex.GlobalMutable = Import.GlobalMutable;
+ WasmYAML::Import Im;
+ Im.Module = Import.Module;
+ Im.Field = Import.Field;
+ Im.Kind = Import.Kind;
+ switch (Im.Kind) {
+ case wasm::WASM_EXTERNAL_FUNCTION:
+ Im.SigIndex = Import.SigIndex;
+ break;
+ case wasm::WASM_EXTERNAL_GLOBAL:
+ Im.GlobalImport.Type = Import.Global.Type;
+ Im.GlobalImport.Mutable = Import.Global.Mutable;
+ break;
+ case wasm::WASM_EXTERNAL_TABLE:
+ Im.TableImport = make_table(Import.Table);
+ break;
+ case wasm::WASM_EXTERNAL_MEMORY:
+ Im.Memory = make_limits(Import.Memory);
+ break;
}
- ImportSec->Imports.push_back(Ex);
+ ImportSec->Imports.push_back(Im);
}
S = std::move(ImportSec);
break;
@@ -107,25 +133,16 @@ ErrorOr<WasmYAML::Object *> WasmDumper::dump() {
}
case wasm::WASM_SEC_TABLE: {
auto TableSec = make_unique<WasmYAML::TableSection>();
- for (auto &Table : Obj.tables()) {
- WasmYAML::Table T;
- T.ElemType = Table.ElemType;
- T.TableLimits.Flags = Table.Limits.Flags;
- T.TableLimits.Initial = Table.Limits.Initial;
- T.TableLimits.Maximum = Table.Limits.Maximum;
- TableSec->Tables.push_back(T);
+ for (const wasm::WasmTable &Table : Obj.tables()) {
+ TableSec->Tables.push_back(make_table(Table));
}
S = std::move(TableSec);
break;
}
case wasm::WASM_SEC_MEMORY: {
auto MemorySec = make_unique<WasmYAML::MemorySection>();
- for (auto &Memory : Obj.memories()) {
- WasmYAML::Limits L;
- L.Flags = Memory.Flags;
- L.Initial = Memory.Initial;
- L.Maximum = Memory.Maximum;
- MemorySec->Memories.push_back(L);
+ for (const wasm::WasmLimits &Memory : Obj.memories()) {
+ MemorySec->Memories.push_back(make_limits(Memory));
}
S = std::move(MemorySec);
break;
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 40459e559986..c362dff3a3e0 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -385,18 +385,20 @@ int main(int argc, char **argv) {
initializeTarget(Registry);
// For codegen passes, only passes that do IR to IR transformation are
// supported.
+ initializeScalarizeMaskedMemIntrinPass(Registry);
initializeCodeGenPreparePass(Registry);
initializeAtomicExpandPass(Registry);
initializeRewriteSymbolsLegacyPassPass(Registry);
initializeWinEHPreparePass(Registry);
initializeDwarfEHPreparePass(Registry);
- initializeSafeStackPass(Registry);
+ initializeSafeStackLegacyPassPass(Registry);
initializeSjLjEHPreparePass(Registry);
initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
initializeGlobalMergePass(Registry);
initializeInterleavedAccessPass(Registry);
initializeCountingFunctionInserterPass(Registry);
initializeUnreachableBlockElimLegacyPassPass(Registry);
+ initializeExpandReductionsPass(Registry);
#ifdef LINK_POLLY_INTO_TOOLS
polly::initializePollyPasses(Registry);
diff --git a/tools/yaml2obj/yaml2wasm.cpp b/tools/yaml2obj/yaml2wasm.cpp
index eed9f2c4039b..5c8aba33ee80 100644
--- a/tools/yaml2obj/yaml2wasm.cpp
+++ b/tools/yaml2obj/yaml2wasm.cpp
@@ -169,8 +169,15 @@ int WasmWriter::writeSectionContent(raw_ostream &OS,
encodeULEB128(Import.SigIndex, OS);
break;
case wasm::WASM_EXTERNAL_GLOBAL:
- encodeSLEB128(Import.GlobalType, OS);
- writeUint8(OS, Import.GlobalMutable);
+ encodeSLEB128(Import.GlobalImport.Type, OS);
+ writeUint8(OS, Import.GlobalImport.Mutable);
+ break;
+ case wasm::WASM_EXTERNAL_MEMORY:
+ writeLimits(Import.Memory, OS);
+ break;
+ case wasm::WASM_EXTERNAL_TABLE:
+ encodeSLEB128(Import.TableImport.ElemType, OS);
+ writeLimits(Import.TableImport.TableLimits, OS);
break;
default:
errs() << "Unknown import type: " << Import.Kind;
diff --git a/unittests/Analysis/ProfileSummaryInfoTest.cpp b/unittests/Analysis/ProfileSummaryInfoTest.cpp
index 0b4b1de28053..3454474f0376 100644
--- a/unittests/Analysis/ProfileSummaryInfoTest.cpp
+++ b/unittests/Analysis/ProfileSummaryInfoTest.cpp
@@ -162,6 +162,12 @@ TEST_F(ProfileSummaryInfoTest, InstrProf) {
EXPECT_TRUE(PSI.isHotCallSite(CS1, &BFI));
EXPECT_FALSE(PSI.isHotCallSite(CS2, &BFI));
+
+ // Test that adding MD_prof metadata with a hot count on CS2 does not
+ // change its hotness, as it has no effect in instrumented profiling.
+ MDBuilder MDB(M->getContext());
+ CI2->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights({400}));
+ EXPECT_FALSE(PSI.isHotCallSite(CS2, &BFI));
}
TEST_F(ProfileSummaryInfoTest, SampleProf) {
diff --git a/unittests/Analysis/TargetLibraryInfoTest.cpp b/unittests/Analysis/TargetLibraryInfoTest.cpp
index 44c141d6a1e9..9d852cf0301b 100644
--- a/unittests/Analysis/TargetLibraryInfoTest.cpp
+++ b/unittests/Analysis/TargetLibraryInfoTest.cpp
@@ -470,6 +470,52 @@ TEST_F(TargetLibraryInfoTest, ValidProto) {
"declare i32 @isascii(i32)\n"
"declare i32 @isdigit(i32)\n"
"declare i32 @toascii(i32)\n"
+
+ // These functions were extracted from math-finite.h which provides
+ // functions similar to those in math.h, but optimized for handling
+ // finite values only.
+ "declare double @__acos_finite(double)\n"
+ "declare float @__acosf_finite(float)\n"
+ "declare x86_fp80 @__acosl_finite(x86_fp80)\n"
+ "declare double @__acosh_finite(double)\n"
+ "declare float @__acoshf_finite(float)\n"
+ "declare x86_fp80 @__acoshl_finite(x86_fp80)\n"
+ "declare double @__asin_finite(double)\n"
+ "declare float @__asinf_finite(float)\n"
+ "declare x86_fp80 @__asinl_finite(x86_fp80)\n"
+ "declare double @__atan2_finite(double, double)\n"
+ "declare float @__atan2f_finite(float, float)\n"
+ "declare x86_fp80 @__atan2l_finite(x86_fp80, x86_fp80)\n"
+ "declare double @__atanh_finite(double)\n"
+ "declare float @__atanhf_finite(float)\n"
+ "declare x86_fp80 @__atanhl_finite(x86_fp80)\n"
+ "declare double @__cosh_finite(double)\n"
+ "declare float @__coshf_finite(float)\n"
+ "declare x86_fp80 @__coshl_finite(x86_fp80)\n"
+ "declare double @__exp10_finite(double)\n"
+ "declare float @__exp10f_finite(float)\n"
+ "declare x86_fp80 @__exp10l_finite(x86_fp80)\n"
+ "declare double @__exp2_finite(double)\n"
+ "declare float @__exp2f_finite(float)\n"
+ "declare x86_fp80 @__exp2l_finite(x86_fp80)\n"
+ "declare double @__exp_finite(double)\n"
+ "declare float @__expf_finite(float)\n"
+ "declare x86_fp80 @__expl_finite(x86_fp80)\n"
+ "declare double @__log10_finite(double)\n"
+ "declare float @__log10f_finite(float)\n"
+ "declare x86_fp80 @__log10l_finite(x86_fp80)\n"
+ "declare double @__log2_finite(double)\n"
+ "declare float @__log2f_finite(float)\n"
+ "declare x86_fp80 @__log2l_finite(x86_fp80)\n"
+ "declare double @__log_finite(double)\n"
+ "declare float @__logf_finite(float)\n"
+ "declare x86_fp80 @__logl_finite(x86_fp80)\n"
+ "declare double @__pow_finite(double, double)\n"
+ "declare float @__powf_finite(float, float)\n"
+ "declare x86_fp80 @__powl_finite(x86_fp80, x86_fp80)\n"
+ "declare double @__sinh_finite(double)\n"
+ "declare float @__sinhf_finite(float)\n"
+ "declare x86_fp80 @__sinhl_finite(x86_fp80)\n"
);
for (unsigned FI = 0; FI != LibFunc::NumLibFuncs; ++FI) {
diff --git a/unittests/DebugInfo/CMakeLists.txt b/unittests/DebugInfo/CMakeLists.txt
index dae472bafdd7..e38fff58cae6 100644
--- a/unittests/DebugInfo/CMakeLists.txt
+++ b/unittests/DebugInfo/CMakeLists.txt
@@ -1,3 +1,3 @@
-
+add_subdirectory(CodeView)
add_subdirectory(DWARF)
add_subdirectory(PDB)
diff --git a/unittests/DebugInfo/CodeView/CMakeLists.txt b/unittests/DebugInfo/CodeView/CMakeLists.txt
new file mode 100644
index 000000000000..854182c4efb4
--- /dev/null
+++ b/unittests/DebugInfo/CodeView/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(LLVM_LINK_COMPONENTS
+ DebugInfoCodeView
+ )
+
+set(DebugInfoCodeViewSources
+ RandomAccessVisitorTest.cpp
+ )
+
+add_llvm_unittest(DebugInfoCodeViewTests
+ ${DebugInfoCodeViewSources}
+ )
diff --git a/unittests/DebugInfo/CodeView/ErrorChecking.h b/unittests/DebugInfo/CodeView/ErrorChecking.h
new file mode 100644
index 000000000000..09310883bf58
--- /dev/null
+++ b/unittests/DebugInfo/CodeView/ErrorChecking.h
@@ -0,0 +1,61 @@
+//===- ErrorChecking.h - Helpers for verifying llvm::Errors -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UNITTESTS_DEBUGINFO_CODEVIEW_ERRORCHECKING_H
+#define LLVM_UNITTESTS_DEBUGINFO_CODEVIEW_ERRORCHECKING_H
+
+#define EXPECT_NO_ERROR(Err) \
+ { \
+ auto E = Err; \
+ EXPECT_FALSE(static_cast<bool>(E)); \
+ if (E) \
+ consumeError(std::move(E)); \
+ }
+
+#define EXPECT_ERROR(Err) \
+ { \
+ auto E = Err; \
+ EXPECT_TRUE(static_cast<bool>(E)); \
+ if (E) \
+ consumeError(std::move(E)); \
+ }
+
+#define EXPECT_EXPECTED(Exp) \
+ { \
+ auto E = Exp.takeError(); \
+ EXPECT_FALSE(static_cast<bool>(E)); \
+ if (E) { \
+ consumeError(std::move(E)); \
+ return; \
+ } \
+ }
+
+#define EXPECT_EXPECTED_EQ(Val, Exp) \
+ { \
+ auto Result = Exp; \
+ auto E = Result.takeError(); \
+ EXPECT_FALSE(static_cast<bool>(E)); \
+ if (E) { \
+ consumeError(std::move(E)); \
+ return; \
+ } \
+ EXPECT_EQ(Val, *Result); \
+ }
+
+#define EXPECT_UNEXPECTED(Exp) \
+ { \
+ auto E = Exp.takeError(); \
+ EXPECT_TRUE(static_cast<bool>(E)); \
+ if (E) { \
+ consumeError(std::move(E)); \
+ return; \
+ } \
+ }
+
+#endif
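
A minimal usage sketch for these macros, assuming a gtest body and a
hypothetical helper that returns llvm::Expected<int> (the helper is invented
for illustration):

  using namespace llvm;
  Expected<int> parseNumber(StringRef S);  // hypothetical helper

  TEST(ErrorCheckingExample, Basics) {
    // Passes only if the Expected holds a value equal to 42.
    EXPECT_EXPECTED_EQ(42, parseNumber("42"));
    // Passes only if the Expected holds an Error.
    EXPECT_UNEXPECTED(parseNumber("nope"));
  }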
diff --git a/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp b/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp
new file mode 100644
index 000000000000..fedb5978da81
--- /dev/null
+++ b/unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp
@@ -0,0 +1,353 @@
+//===- llvm/unittest/DebugInfo/CodeView/RandomAccessVisitorTest.cpp -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ErrorChecking.h"
+
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeRecordMapping.h"
+#include "llvm/DebugInfo/CodeView/TypeSerializer.h"
+#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
+#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryItemStream.h"
+#include "llvm/Support/Error.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+namespace llvm {
+namespace codeview {
+inline bool operator==(const ArrayRecord &R1, const ArrayRecord &R2) {
+ if (R1.ElementType != R2.ElementType)
+ return false;
+ if (R1.IndexType != R2.IndexType)
+ return false;
+ if (R1.Name != R2.Name)
+ return false;
+ if (R1.Size != R2.Size)
+ return false;
+ return true;
+}
+inline bool operator!=(const ArrayRecord &R1, const ArrayRecord &R2) {
+ return !(R1 == R2);
+}
+
+inline bool operator==(const CVType &R1, const CVType &R2) {
+ if (R1.Type != R2.Type)
+ return false;
+ if (R1.RecordData != R2.RecordData)
+ return false;
+ return true;
+}
+inline bool operator!=(const CVType &R1, const CVType &R2) {
+ return !(R1 == R2);
+}
+}
+}
+
+namespace llvm {
+template <> struct BinaryItemTraits<CVType> {
+ static size_t length(const CVType &Item) { return Item.length(); }
+ static ArrayRef<uint8_t> bytes(const CVType &Item) { return Item.data(); }
+};
+}
+
+namespace {
+
+class MockCallbacks : public TypeVisitorCallbacks {
+public:
+ Error visitTypeBegin(CVType &CVR, TypeIndex Index) override {
+ Indices.push_back(Index);
+ return Error::success();
+ }
+ Error visitKnownRecord(CVType &CVR, ArrayRecord &AR) override {
+ VisitedRecords.push_back(AR);
+ RawRecords.push_back(CVR);
+ return Error::success();
+ }
+
+ uint32_t count() const {
+ assert(Indices.size() == RawRecords.size());
+ assert(Indices.size() == VisitedRecords.size());
+ return Indices.size();
+ }
+ std::vector<TypeIndex> Indices;
+ std::vector<CVType> RawRecords;
+ std::vector<ArrayRecord> VisitedRecords;
+};
+
+class RandomAccessVisitorTest : public testing::Test {
+public:
+ RandomAccessVisitorTest() {}
+
+ static void SetUpTestCase() {
+ GlobalState = llvm::make_unique<GlobalTestState>();
+
+ TypeTableBuilder Builder(GlobalState->Allocator);
+
+ uint32_t Offset = 0;
+ for (int I = 0; I < 11; ++I) {
+ ArrayRecord AR(TypeRecordKind::Array);
+ AR.ElementType = TypeIndex::Int32();
+ AR.IndexType = TypeIndex::UInt32();
+ AR.Size = I;
+ std::string Name;
+ raw_string_ostream Stream(Name);
+ Stream << "Array [" << I << "]";
+ AR.Name = GlobalState->Strings.save(Stream.str());
+ GlobalState->Records.push_back(AR);
+ GlobalState->Indices.push_back(Builder.writeKnownType(AR));
+
+ CVType Type(TypeLeafKind::LF_ARRAY, Builder.records().back());
+ GlobalState->TypeVector.push_back(Type);
+
+ GlobalState->AllOffsets.push_back(
+ {GlobalState->Indices.back(), ulittle32_t(Offset)});
+ Offset += Type.length();
+ }
+
+ GlobalState->ItemStream.setItems(GlobalState->TypeVector);
+ GlobalState->TypeArray = VarStreamArray<CVType>(GlobalState->ItemStream);
+ }
+
+ static void TearDownTestCase() { GlobalState.reset(); }
+
+ void SetUp() override {
+ TestState = llvm::make_unique<PerTestState>();
+
+ TestState->Pipeline.addCallbackToPipeline(TestState->Deserializer);
+ TestState->Pipeline.addCallbackToPipeline(TestState->Callbacks);
+ }
+
+ void TearDown() override { TestState.reset(); }
+
+protected:
+ bool ValidateDatabaseRecord(const RandomAccessTypeVisitor &Visitor,
+ uint32_t Index) {
+ TypeIndex TI = TypeIndex::fromArrayIndex(Index);
+ if (!Visitor.database().contains(TI))
+ return false;
+ if (GlobalState->TypeVector[Index] != Visitor.database().getTypeRecord(TI))
+ return false;
+ return true;
+ }
+
+ bool ValidateVisitedRecord(uint32_t VisitationOrder,
+ uint32_t GlobalArrayIndex) {
+ TypeIndex TI = TypeIndex::fromArrayIndex(GlobalArrayIndex);
+ if (TI != TestState->Callbacks.Indices[VisitationOrder])
+ return false;
+
+ if (GlobalState->TypeVector[TI.toArrayIndex()] !=
+ TestState->Callbacks.RawRecords[VisitationOrder])
+ return false;
+
+ if (GlobalState->Records[TI.toArrayIndex()] !=
+ TestState->Callbacks.VisitedRecords[VisitationOrder])
+ return false;
+
+ return true;
+ }
+
+ struct GlobalTestState {
+ GlobalTestState() : Strings(Allocator), ItemStream(llvm::support::little) {}
+
+ BumpPtrAllocator Allocator;
+ StringSaver Strings;
+
+ std::vector<ArrayRecord> Records;
+ std::vector<TypeIndex> Indices;
+ std::vector<TypeIndexOffset> AllOffsets;
+ std::vector<CVType> TypeVector;
+ BinaryItemStream<CVType> ItemStream;
+ VarStreamArray<CVType> TypeArray;
+
+ MutableBinaryByteStream Stream;
+ };
+
+ struct PerTestState {
+ FixedStreamArray<TypeIndexOffset> Offsets;
+
+ TypeVisitorCallbackPipeline Pipeline;
+ TypeDeserializer Deserializer;
+ MockCallbacks Callbacks;
+ };
+
+ FixedStreamArray<TypeIndexOffset>
+ createPartialOffsets(MutableBinaryByteStream &Storage,
+ std::initializer_list<uint32_t> Indices) {
+
+ uint32_t Count = Indices.size();
+ uint32_t Size = Count * sizeof(TypeIndexOffset);
+ uint8_t *Buffer = GlobalState->Allocator.Allocate<uint8_t>(Size);
+ MutableArrayRef<uint8_t> Bytes(Buffer, Size);
+ Storage = MutableBinaryByteStream(Bytes, support::little);
+ BinaryStreamWriter Writer(Storage);
+ for (const auto I : Indices)
+ consumeError(Writer.writeObject(GlobalState->AllOffsets[I]));
+
+ BinaryStreamReader Reader(Storage);
+ FixedStreamArray<TypeIndexOffset> Result;
+ consumeError(Reader.readArray(Result, Count));
+ return Result;
+ }
+
+ static std::unique_ptr<GlobalTestState> GlobalState;
+ std::unique_ptr<PerTestState> TestState;
+};
+
+std::unique_ptr<RandomAccessVisitorTest::GlobalTestState>
+ RandomAccessVisitorTest::GlobalState;
+}
+
+TEST_F(RandomAccessVisitorTest, MultipleVisits) {
+ TestState->Offsets = createPartialOffsets(GlobalState->Stream, {0, 8});
+ RandomAccessTypeVisitor Visitor(GlobalState->TypeArray,
+ GlobalState->TypeVector.size(),
+ TestState->Offsets);
+
+ std::vector<uint32_t> IndicesToVisit = {5, 5, 5};
+
+ for (uint32_t I : IndicesToVisit) {
+ TypeIndex TI = TypeIndex::fromArrayIndex(I);
+ EXPECT_NO_ERROR(Visitor.visitTypeIndex(TI, TestState->Pipeline));
+ }
+
+ // [0,8) should be present
+ EXPECT_EQ(8u, Visitor.database().size());
+ for (uint32_t I = 0; I < 8; ++I)
+ EXPECT_TRUE(ValidateDatabaseRecord(Visitor, I));
+
+ // 5, 5, 5
+ EXPECT_EQ(3u, TestState->Callbacks.count());
+ for (auto I : enumerate(IndicesToVisit))
+ EXPECT_TRUE(ValidateVisitedRecord(I.index(), I.value()));
+}
+
+TEST_F(RandomAccessVisitorTest, DescendingWithinChunk) {
+ // Visit multiple items from the same "chunk" in reverse order. In this
+ // example, it's 7 then 4 then 2. At the end, all records from 0 to 7 should
+ // be known by the database, but only 2, 4, and 7 should have been visited.
+ TestState->Offsets = createPartialOffsets(GlobalState->Stream, {0, 8});
+
+ std::vector<uint32_t> IndicesToVisit = {7, 4, 2};
+
+ RandomAccessTypeVisitor Visitor(GlobalState->TypeArray,
+ GlobalState->TypeVector.size(),
+ TestState->Offsets);
+
+ for (uint32_t I : IndicesToVisit) {
+ TypeIndex TI = TypeIndex::fromArrayIndex(I);
+ EXPECT_NO_ERROR(Visitor.visitTypeIndex(TI, TestState->Pipeline));
+ }
+
+ // [0, 7]
+ EXPECT_EQ(8u, Visitor.database().size());
+ for (uint32_t I = 0; I < 8; ++I)
+ EXPECT_TRUE(ValidateDatabaseRecord(Visitor, I));
+
+ // 2, 4, 7
+ EXPECT_EQ(3u, TestState->Callbacks.count());
+ for (auto I : enumerate(IndicesToVisit))
+ EXPECT_TRUE(ValidateVisitedRecord(I.index(), I.value()));
+}
+
+TEST_F(RandomAccessVisitorTest, AscendingWithinChunk) {
+ // * Visit multiple items from the same chunk in ascending order, ensuring
+ // that intermediate items are not visited. In the example below, it's
+ // 2 -> 4 -> 7, all of which come from the [0,8) chunk.
+ TestState->Offsets = createPartialOffsets(GlobalState->Stream, {0, 8});
+
+ std::vector<uint32_t> IndicesToVisit = {2, 4, 7};
+
+ RandomAccessTypeVisitor Visitor(GlobalState->TypeArray,
+ GlobalState->TypeVector.size(),
+ TestState->Offsets);
+
+ for (uint32_t I : IndicesToVisit) {
+ TypeIndex TI = TypeIndex::fromArrayIndex(I);
+ EXPECT_NO_ERROR(Visitor.visitTypeIndex(TI, TestState->Pipeline));
+ }
+
+ // [0, 7]
+ EXPECT_EQ(8u, Visitor.database().size());
+ for (uint32_t I = 0; I < 8; ++I)
+ EXPECT_TRUE(ValidateDatabaseRecord(Visitor, I));
+
+ // 2, 4, 7
+ EXPECT_EQ(3u, TestState->Callbacks.count());
+ for (auto &I : enumerate(IndicesToVisit))
+ EXPECT_TRUE(ValidateVisitedRecord(I.index(), I.value()));
+}
+
+TEST_F(RandomAccessVisitorTest, StopPrematurelyInChunk) {
+ // * Stop partway through a chunk, ensuring that visitation stops at the
+ // last record you request even though the chunk extends further. In the
+ // example below this is tested by visiting 0, 1, and 2, but none of the
+ // remaining records in the [0,8) chunk.
+ TestState->Offsets = createPartialOffsets(GlobalState->Stream, {0, 8});
+
+ std::vector<uint32_t> IndicesToVisit = {0, 1, 2};
+
+ RandomAccessTypeVisitor Visitor(GlobalState->TypeArray,
+ GlobalState->TypeVector.size(),
+ TestState->Offsets);
+
+ for (uint32_t I : IndicesToVisit) {
+ TypeIndex TI = TypeIndex::fromArrayIndex(I);
+ EXPECT_NO_ERROR(Visitor.visitTypeIndex(TI, TestState->Pipeline));
+ }
+
+ // [0, 8) should be present in the database.
+ EXPECT_EQ(8u, Visitor.database().size());
+ for (uint32_t I = 0; I < 8; ++I)
+ EXPECT_TRUE(ValidateDatabaseRecord(Visitor, I));
+
+ // [0, 2]
+ EXPECT_EQ(3u, TestState->Callbacks.count());
+ for (auto I : enumerate(IndicesToVisit))
+ EXPECT_TRUE(ValidateVisitedRecord(I.index(), I.value()));
+}
+
+TEST_F(RandomAccessVisitorTest, InnerChunk) {
+ // Test that when a request comes from a chunk in the middle of the partial
+ // offsets array, items from surrounding chunks are not visited or
+ // added to the database.
+ TestState->Offsets = createPartialOffsets(GlobalState->Stream, {0, 4, 9});
+
+ std::vector<uint32_t> IndicesToVisit = {5, 7};
+
+ RandomAccessTypeVisitor Visitor(GlobalState->TypeArray,
+ GlobalState->TypeVector.size(),
+ TestState->Offsets);
+
+ for (uint32_t I : IndicesToVisit) {
+ TypeIndex TI = TypeIndex::fromArrayIndex(I);
+ EXPECT_NO_ERROR(Visitor.visitTypeIndex(TI, TestState->Pipeline));
+ }
+
+ // [4, 9)
+ EXPECT_EQ(5u, Visitor.database().size());
+ for (uint32_t I = 4; I < 9; ++I)
+ EXPECT_TRUE(ValidateDatabaseRecord(Visitor, I));
+
+ // 5, 7
+ EXPECT_EQ(2u, TestState->Callbacks.count());
+ for (auto &I : enumerate(IndicesToVisit))
+ EXPECT_TRUE(ValidateVisitedRecord(I.index(), I.value()));
+}
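
For orientation, the partial-offsets array these tests exercise lets the
visitor binary-search for the chunk containing a requested TypeIndex and
deserialize only from that chunk's stream offset. A hedged sketch of the
lookup (the visitor's real member names may differ):

  // Find the last chunk whose starting TypeIndex is <= TI.
  auto It = std::upper_bound(Offsets.begin(), Offsets.end(), TI,
                             [](TypeIndex V, const TypeIndexOffset &O) {
                               return V < O.Type;
                             });
  --It;  // It->Offset is the stream offset where TI's chunk begins.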
diff --git a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
index 96214a368dce..362c143c54ef 100644
--- a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
@@ -304,7 +304,7 @@ TEST(ObjectTransformLayerTest, Main) {
return nullptr;
}
void registerEHFrames(uint8_t *, uint64_t, size_t) override {}
- void deregisterEHFrames(uint8_t *, uint64_t, size_t) override {}
+ void deregisterEHFrames() override {}
bool finalizeMemory(std::string *) override { return false; }
};
diff --git a/unittests/ExecutionEngine/Orc/OrcTestCommon.h b/unittests/ExecutionEngine/Orc/OrcTestCommon.h
index 7fb26634c7a7..dff72c6b9d57 100644
--- a/unittests/ExecutionEngine/Orc/OrcTestCommon.h
+++ b/unittests/ExecutionEngine/Orc/OrcTestCommon.h
@@ -101,7 +101,7 @@ class TypeBuilder<DummyStruct, XCompile> {
public:
static StructType *get(LLVMContext &Context) {
return StructType::get(
- TypeBuilder<types::i<32>[256], XCompile>::get(Context), nullptr);
+ TypeBuilder<types::i<32>[256], XCompile>::get(Context));
}
};
diff --git a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
index de99c022fb9d..c13a75a5cbfe 100644
--- a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
@@ -90,7 +90,8 @@ TEST(RTDyldObjectLinkingLayerTest, TestSetProcessAllSections) {
Objs.push_back(OwningObj.getBinary());
bool DebugSectionSeen = false;
- SectionMemoryManagerWrapper SMMW(DebugSectionSeen);
+ auto SMMW =
+ std::make_shared<SectionMemoryManagerWrapper>(DebugSectionSeen);
auto Resolver =
createLambdaResolver(
[](const std::string &Name) {
@@ -102,7 +103,7 @@ TEST(RTDyldObjectLinkingLayerTest, TestSetProcessAllSections) {
{
// Test with ProcessAllSections = false (the default).
- auto H = ObjLayer.addObjectSet(Objs, &SMMW, &*Resolver);
+ auto H = ObjLayer.addObjectSet(Objs, SMMW, &*Resolver);
ObjLayer.emitAndFinalize(H);
EXPECT_EQ(DebugSectionSeen, false)
<< "Unexpected debug info section";
@@ -112,7 +113,7 @@ TEST(RTDyldObjectLinkingLayerTest, TestSetProcessAllSections) {
{
// Test with ProcessAllSections = true.
ObjLayer.setProcessAllSections(true);
- auto H = ObjLayer.addObjectSet(Objs, &SMMW, &*Resolver);
+ auto H = ObjLayer.addObjectSet(Objs, SMMW, &*Resolver);
ObjLayer.emitAndFinalize(H);
EXPECT_EQ(DebugSectionSeen, true)
<< "Expected debug info section not seen";
@@ -178,14 +179,15 @@ TEST_F(RTDyldObjectLinkingLayerExecutionTest, NoDuplicateFinalization) {
return JITSymbol(nullptr);
});
- SectionMemoryManagerWrapper SMMW;
- ObjLayer.addObjectSet(std::move(Obj1Set), &SMMW, &*Resolver);
- auto H = ObjLayer.addObjectSet(std::move(Obj2Set), &SMMW, &*Resolver);
+ auto SMMW = std::make_shared<SectionMemoryManagerWrapper>();
+ ObjLayer.addObjectSet(std::move(Obj1Set), SMMW, &*Resolver);
+ auto H = ObjLayer.addObjectSet(std::move(Obj2Set), SMMW, &*Resolver);
ObjLayer.emitAndFinalize(H);
-
+ ObjLayer.removeObjectSet(H);
+
// Finalization of module 2 should trigger finalization of module 1.
// Verify that finalize on SMMW is only called once.
- EXPECT_EQ(SMMW.FinalizationCount, 1)
+ EXPECT_EQ(SMMW->FinalizationCount, 1)
<< "Extra call to finalize";
}
@@ -238,14 +240,15 @@ TEST_F(RTDyldObjectLinkingLayerExecutionTest, NoPrematureAllocation) {
std::vector<object::ObjectFile*> Obj2Set;
Obj2Set.push_back(Obj2.getBinary());
- SectionMemoryManagerWrapper SMMW;
+ auto SMMW = std::make_shared<SectionMemoryManagerWrapper>();
NullResolver NR;
- auto H = ObjLayer.addObjectSet(std::move(Obj1Set), &SMMW, &NR);
- ObjLayer.addObjectSet(std::move(Obj2Set), &SMMW, &NR);
+ auto H = ObjLayer.addObjectSet(std::move(Obj1Set), SMMW, &NR);
+ ObjLayer.addObjectSet(std::move(Obj2Set), SMMW, &NR);
ObjLayer.emitAndFinalize(H);
-
+ ObjLayer.removeObjectSet(H);
+
// Only one call to needsToReserveAllocationSpace should have been made.
- EXPECT_EQ(SMMW.NeedsToReserveAllocationSpaceCount, 1)
+ EXPECT_EQ(SMMW->NeedsToReserveAllocationSpaceCount, 1)
<< "More than one call to needsToReserveAllocationSpace "
"(multiple unrelated objects loaded prior to finalization)";
}
diff --git a/unittests/IR/ConstantRangeTest.cpp b/unittests/IR/ConstantRangeTest.cpp
index b22f82154f40..c6c9bf6d6b50 100644
--- a/unittests/IR/ConstantRangeTest.cpp
+++ b/unittests/IR/ConstantRangeTest.cpp
@@ -443,6 +443,11 @@ TEST_F(ConstantRangeTest, Multiply) {
EXPECT_EQ(ConstantRange(APInt(8, 254), APInt(8, 255)).multiply(
ConstantRange(APInt(8, 2), APInt(8, 4))),
ConstantRange(APInt(8, 250), APInt(8, 253)));
+
+ // TODO: This should return [-2, 0].
+ EXPECT_EQ(ConstantRange(APInt(8, -2)).multiply(
+ ConstantRange(APInt(8, 0), APInt(8, 2))),
+ ConstantRange(APInt(8, -2), APInt(8, 1)));
}
TEST_F(ConstantRangeTest, UMax) {
@@ -703,13 +708,13 @@ TEST(ConstantRange, MakeGuaranteedNoWrapRegion) {
Instruction::Add, ConstantRange(32, /* isFullSet = */ true),
OBO::NoUnsignedWrap);
EXPECT_TRUE(NUWForAllValues.isSingleElement() &&
- NSWForAllValues.getSingleElement()->isMinValue());
+ NUWForAllValues.getSingleElement()->isMinValue());
auto NUWAndNSWForAllValues = ConstantRange::makeGuaranteedNoWrapRegion(
Instruction::Add, ConstantRange(32, /* isFullSet = */ true),
OBO::NoUnsignedWrap | OBO::NoSignedWrap);
EXPECT_TRUE(NUWAndNSWForAllValues.isSingleElement() &&
- NSWForAllValues.getSingleElement()->isMinValue());
+ NUWAndNSWForAllValues.getSingleElement()->isMinValue());
ConstantRange OneToFive(APInt(32, 1), APInt(32, 6));
EXPECT_EQ(ConstantRange::makeGuaranteedNoWrapRegion(
diff --git a/unittests/IR/InstructionsTest.cpp b/unittests/IR/InstructionsTest.cpp
index 7c75aaec1753..b8d398c5cc38 100644
--- a/unittests/IR/InstructionsTest.cpp
+++ b/unittests/IR/InstructionsTest.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
+#include "gmock/gmock-matchers.h"
#include "gtest/gtest.h"
#include <memory>
@@ -740,5 +741,11 @@ TEST(InstructionsTest, SwitchInst) {
EXPECT_EQ(BB1.get(), Handle.getCaseSuccessor());
}
+TEST(InstructionsTest, CommuteShuffleMask) {
+ SmallVector<int, 16> Indices({-1, 0, 7});
+ ShuffleVectorInst::commuteShuffleMask(Indices, 4);
+ EXPECT_THAT(Indices, testing::ContainerEq(ArrayRef<int>({-1, 4, 3})));
+}
+
} // end anonymous namespace
} // end namespace llvm
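
As a quick sanity check of the expectation above: commuting the mask of a
shuffle of two 4-element vectors maps lane index i to i + 4 when i < 4 and to
i - 4 when i >= 4, with undefined lanes (-1) left alone, so {-1, 0, 7}
becomes {-1, 4, 3}.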
diff --git a/unittests/IR/TypeBuilderTest.cpp b/unittests/IR/TypeBuilderTest.cpp
index f2dccac001a4..9ba776543d94 100644
--- a/unittests/IR/TypeBuilderTest.cpp
+++ b/unittests/IR/TypeBuilderTest.cpp
@@ -264,23 +264,21 @@ namespace {
TEST(TypeBuilderTest, Extensions) {
LLVMContext Context;
- EXPECT_EQ(PointerType::getUnqual(StructType::get(
- TypeBuilder<int, false>::get(Context),
- TypeBuilder<int *, false>::get(Context),
- TypeBuilder<void *[], false>::get(Context), (void *)nullptr)),
+ EXPECT_EQ(PointerType::getUnqual(
+ StructType::get(TypeBuilder<int, false>::get(Context),
+ TypeBuilder<int *, false>::get(Context),
+ TypeBuilder<void *[], false>::get(Context))),
(TypeBuilder<MyType *, false>::get(Context)));
- EXPECT_EQ(
- PointerType::getUnqual(StructType::get(
- TypeBuilder<types::i<32>, false>::get(Context),
- TypeBuilder<types::i<32> *, false>::get(Context),
- TypeBuilder<types::i<8> *[], false>::get(Context), (void *)nullptr)),
- (TypeBuilder<MyPortableType *, false>::get(Context)));
- EXPECT_EQ(
- PointerType::getUnqual(StructType::get(
- TypeBuilder<types::i<32>, false>::get(Context),
- TypeBuilder<types::i<32> *, false>::get(Context),
- TypeBuilder<types::i<8> *[], false>::get(Context), (void *)nullptr)),
- (TypeBuilder<MyPortableType *, true>::get(Context)));
+ EXPECT_EQ(PointerType::getUnqual(StructType::get(
+ TypeBuilder<types::i<32>, false>::get(Context),
+ TypeBuilder<types::i<32> *, false>::get(Context),
+ TypeBuilder<types::i<8> *[], false>::get(Context))),
+ (TypeBuilder<MyPortableType *, false>::get(Context)));
+ EXPECT_EQ(PointerType::getUnqual(StructType::get(
+ TypeBuilder<types::i<32>, false>::get(Context),
+ TypeBuilder<types::i<32> *, false>::get(Context),
+ TypeBuilder<types::i<8> *[], false>::get(Context))),
+ (TypeBuilder<MyPortableType *, true>::get(Context)));
}
} // anonymous namespace
diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt
index 1f677100dcef..f8d3c1c9a8c7 100644
--- a/unittests/Support/CMakeLists.txt
+++ b/unittests/Support/CMakeLists.txt
@@ -36,6 +36,7 @@ add_llvm_unittest(SupportTests
MemoryBufferTest.cpp
MemoryTest.cpp
NativeFormatTests.cpp
+ ParallelTest.cpp
Path.cpp
ProcessTest.cpp
ProgramTest.cpp
diff --git a/unittests/Support/DynamicLibrary/DynamicLibraryTest.cpp b/unittests/Support/DynamicLibrary/DynamicLibraryTest.cpp
index d46eadc9a046..0674a91282a1 100644
--- a/unittests/Support/DynamicLibrary/DynamicLibraryTest.cpp
+++ b/unittests/Support/DynamicLibrary/DynamicLibraryTest.cpp
@@ -23,8 +23,10 @@ using namespace llvm::sys;
extern "C" PIPSQUEAK_EXPORT const char *TestA() { return "ProcessCall"; }
std::string LibPath() {
+ const std::vector<testing::internal::string>& Argvs = testing::internal::GetArgvs();
+ const char *Argv0 = Argvs.size() > 0 ? Argvs[0].c_str() : "DynamicLibraryTests";
void *Ptr = (void*)(intptr_t)TestA;
- std::string Path = fs::getMainExecutable("DynamicLibraryTests", Ptr);
+ std::string Path = fs::getMainExecutable(Argv0, Ptr);
llvm::SmallString<256> Buf(path::parent_path(Path));
path::append(Buf, "PipSqueak.so");
return Buf.str();
diff --git a/unittests/Support/ParallelTest.cpp b/unittests/Support/ParallelTest.cpp
new file mode 100644
index 000000000000..d734e0dd8586
--- /dev/null
+++ b/unittests/Support/ParallelTest.cpp
@@ -0,0 +1,53 @@
+//===- llvm/unittest/Support/ParallelTest.cpp -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Parallel.h unit tests.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Parallel.h"
+#include "gtest/gtest.h"
+#include <array>
+#include <random>
+
+uint32_t array[1024 * 1024];
+
+using namespace llvm;
+
+// Tests below hang on mingw. Investigating.
+#if !defined(__MINGW32__)
+
+TEST(Parallel, sort) {
+ std::mt19937 randEngine;
+ std::uniform_int_distribution<uint32_t> dist;
+
+ for (auto &i : array)
+ i = dist(randEngine);
+
+ sort(parallel::par, std::begin(array), std::end(array));
+ ASSERT_TRUE(std::is_sorted(std::begin(array), std::end(array)));
+}
+
+TEST(Parallel, parallel_for) {
+ // We need to test the case with a TaskSize > 1. We are white-box testing
+ // here. The TaskSize is calculated as (End - Begin) / 1024 at the time of
+ // writing.
+ uint32_t range[2050];
+ std::fill(range, range + 2050, 1);
+ for_each_n(parallel::par, 0, 2049, [&range](size_t I) { ++range[I]; });
+
+ uint32_t expected[2049];
+ std::fill(expected, expected + 2049, 2);
+ ASSERT_TRUE(std::equal(range, range + 2049, expected));
+ // Check that we don't write past the end of the requested range.
+ ASSERT_EQ(range[2049], 1u);
+}
+
+#endif
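
A usage sketch outside the unit test, assuming the llvm::parallel interfaces
keep the signatures exercised above:

  #include "llvm/Support/Parallel.h"
  #include <vector>
  using namespace llvm;

  void squareAndSort(std::vector<uint32_t> &V) {
    // Run the loop body concurrently over [0, V.size()).
    for_each_n(parallel::par, size_t(0), V.size(),
               [&V](size_t I) { V[I] *= V[I]; });
    // Parallel sort of the whole range.
    sort(parallel::par, V.begin(), V.end());
  }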
diff --git a/unittests/Support/Path.cpp b/unittests/Support/Path.cpp
index 426aff47c746..a4bdcb5c79a2 100644
--- a/unittests/Support/Path.cpp
+++ b/unittests/Support/Path.cpp
@@ -1047,7 +1047,7 @@ TEST_F(FileSystemTest, MD5) {
SmallString<64> TempPath;
ASSERT_NO_ERROR(fs::createTemporaryFile("prefix", "temp", FD, TempPath));
StringRef Data("abcdefghijklmnopqrstuvwxyz");
- write(FD, Data.data(), Data.size());
+ ASSERT_EQ(write(FD, Data.data(), Data.size()), static_cast<ssize_t>(Data.size()));
lseek(FD, 0, SEEK_SET);
auto Hash = fs::md5_contents(FD);
::close(FD);
diff --git a/unittests/Transforms/Utils/Cloning.cpp b/unittests/Transforms/Utils/Cloning.cpp
index 2f4ee8636530..83f146dca704 100644
--- a/unittests/Transforms/Utils/Cloning.cpp
+++ b/unittests/Transforms/Utils/Cloning.cpp
@@ -296,7 +296,6 @@ protected:
Value* AllocaContent = IBuilder.getInt32(1);
Instruction* Store = IBuilder.CreateStore(AllocaContent, Alloca);
IBuilder.SetCurrentDebugLocation(DebugLoc::get(5, 2, Subprogram));
- Instruction* Terminator = IBuilder.CreateRetVoid();
// Create a local variable around the alloca
auto *IntType = DBuilder.createBasicType("int", 32, dwarf::DW_ATE_signed);
@@ -306,12 +305,25 @@ protected:
auto *DL = DILocation::get(Subprogram->getContext(), 5, 0, Subprogram);
DBuilder.insertDeclare(Alloca, Variable, E, DL, Store);
DBuilder.insertDbgValueIntrinsic(AllocaContent, 0, Variable, E, DL,
- Terminator);
- // Finalize the debug info
+ Entry);
+ // Also create an inlined variable.
+ auto *InlinedSP =
+ DBuilder.createFunction(CU, "inlined", "inlined", File, 8, FuncType,
+ true, true, 9, DINode::FlagZero, false);
+ auto *InlinedVar =
+ DBuilder.createAutoVariable(InlinedSP, "inlined", File, 5, IntType, true);
+ auto *Scope = DBuilder.createLexicalBlock(
+ DBuilder.createLexicalBlockFile(InlinedSP, File), File, 1, 1);
+ auto InlinedDL =
+ DebugLoc::get(9, 4, Scope, DebugLoc::get(5, 2, Subprogram));
+ IBuilder.SetCurrentDebugLocation(InlinedDL);
+ DBuilder.insertDeclare(Alloca, InlinedVar, E, InlinedDL, Store);
+ IBuilder.CreateStore(IBuilder.getInt32(2), Alloca);
+ // Finalize the debug info.
DBuilder.finalize();
+ IBuilder.CreateRetVoid();
-
- // Create another, empty, compile unit
+ // Create another, empty, compile unit.
DIBuilder DBuilder2(*M);
DBuilder2.createCompileUnit(dwarf::DW_LANG_C99,
DBuilder.createFile("extra.c", "/file/dir"),
@@ -345,15 +357,8 @@ TEST_F(CloneFunc, NewFunctionCreated) {
// function, while the original subprogram still points to the old one.
TEST_F(CloneFunc, Subprogram) {
EXPECT_FALSE(verifyModule(*M));
-
- unsigned SubprogramCount = Finder->subprogram_count();
- EXPECT_EQ(1U, SubprogramCount);
-
- auto Iter = Finder->subprograms().begin();
- auto *Sub = cast<DISubprogram>(*Iter);
-
- EXPECT_TRUE(Sub == OldFunc->getSubprogram());
- EXPECT_TRUE(Sub == NewFunc->getSubprogram());
+ EXPECT_EQ(3U, Finder->subprogram_count());
+ EXPECT_NE(NewFunc->getSubprogram(), OldFunc->getSubprogram());
}
// Test that instructions in the old function still belong to it in the
@@ -380,8 +385,8 @@ TEST_F(CloneFunc, InstructionOwnership) {
EXPECT_EQ(OldDL.getCol(), NewDL.getCol());
// But that they belong to different functions
- auto *OldSubprogram = cast<DISubprogram>(OldDL.getScope());
- auto *NewSubprogram = cast<DISubprogram>(NewDL.getScope());
+ auto *OldSubprogram = cast<DISubprogram>(OldDL.getInlinedAtScope());
+ auto *NewSubprogram = cast<DISubprogram>(NewDL.getInlinedAtScope());
EXPECT_EQ(OldFunc->getSubprogram(), OldSubprogram);
EXPECT_EQ(NewFunc->getSubprogram(), NewSubprogram);
}
@@ -416,22 +421,26 @@ TEST_F(CloneFunc, DebugIntrinsics) {
EXPECT_EQ(NewFunc, cast<AllocaInst>(NewIntrin->getAddress())->
getParent()->getParent());
- // Old variable must belong to the old function
- EXPECT_EQ(OldFunc->getSubprogram(),
- cast<DISubprogram>(OldIntrin->getVariable()->getScope()));
- // New variable must belong to the New function
- EXPECT_EQ(NewFunc->getSubprogram(),
- cast<DISubprogram>(NewIntrin->getVariable()->getScope()));
+ if (!OldIntrin->getDebugLoc()->getInlinedAt()) {
+ // Old variable must belong to the old function.
+ EXPECT_EQ(OldFunc->getSubprogram(),
+ cast<DISubprogram>(OldIntrin->getVariable()->getScope()));
+ // New variable must belong to the new function.
+ EXPECT_EQ(NewFunc->getSubprogram(),
+ cast<DISubprogram>(NewIntrin->getVariable()->getScope()));
+ }
} else if (DbgValueInst* OldIntrin = dyn_cast<DbgValueInst>(&OldI)) {
DbgValueInst* NewIntrin = dyn_cast<DbgValueInst>(&NewI);
EXPECT_TRUE(NewIntrin);
- // Old variable must belong to the old function
- EXPECT_EQ(OldFunc->getSubprogram(),
- cast<DISubprogram>(OldIntrin->getVariable()->getScope()));
- // New variable must belong to the New function
- EXPECT_EQ(NewFunc->getSubprogram(),
- cast<DISubprogram>(NewIntrin->getVariable()->getScope()));
+ if (!OldIntrin->getDebugLoc()->getInlinedAt()) {
+ // Old variable must belong to the old function.
+ EXPECT_EQ(OldFunc->getSubprogram(),
+ cast<DISubprogram>(OldIntrin->getVariable()->getScope()));
+ // New variable must belong to the new function.
+ EXPECT_EQ(NewFunc->getSubprogram(),
+ cast<DISubprogram>(NewIntrin->getVariable()->getScope()));
+ }
}
++OldIter;
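
For orientation, the nested DebugLoc built above encodes an inlined-at chain;
a hedged sketch of reading it back for some Instruction &I:

  if (DILocation *DL = I.getDebugLoc()) {
    // For the inlined variable's intrinsics, DL->getScope() is the lexical
    // block inside the "inlined" subprogram, and DL->getInlinedAt() is the
    // call-site location (line 5, column 2) in the outer subprogram.
    DILocation *CallSite = DL->getInlinedAt();
    (void)CallSite;
  }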
diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp
index bb2ec2a64e49..f4a760990999 100644
--- a/utils/TableGen/CodeGenInstruction.cpp
+++ b/utils/TableGen/CodeGenInstruction.cpp
@@ -77,6 +77,7 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
PrintMethod = Rec->getValueAsString("PrintMethod");
OperandType = Rec->getValueAsString("OperandType");
OperandNamespace = Rec->getValueAsString("OperandNamespace");
+ EncoderMethod = Rec->getValueAsString("EncoderMethod");
} else if (Rec->isSubClassOf("Operand")) {
PrintMethod = Rec->getValueAsString("PrintMethod");
OperandType = Rec->getValueAsString("OperandType");
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 30516ef5d10d..1903f405d859 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -415,7 +415,7 @@ EmitStageAndOperandCycleData(raw_ostream &OS,
BypassTable += " 0, // No itinerary\n";
// For each Itinerary across all processors, add a unique entry to the stages,
- // operand cycles, and pipepine bypess tables. Then add the new Itinerary
+ // operand cycles, and pipeline bypass tables. Then add the new Itinerary
// object with computed offsets to the ProcItinLists result.
unsigned StageCount = 1, OperandCycleCount = 1;
std::map<std::string, unsigned> ItinStageMap, ItinOperandMap;
diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp
index 4298bc5763b6..55e75763ad69 100644
--- a/utils/TableGen/X86RecognizableInstr.cpp
+++ b/utils/TableGen/X86RecognizableInstr.cpp
@@ -21,129 +21,6 @@
#include <string>
using namespace llvm;
-
-#define MRM_MAPPING \
- MAP(C0, 64) \
- MAP(C1, 65) \
- MAP(C2, 66) \
- MAP(C3, 67) \
- MAP(C4, 68) \
- MAP(C5, 69) \
- MAP(C6, 70) \
- MAP(C7, 71) \
- MAP(C8, 72) \
- MAP(C9, 73) \
- MAP(CA, 74) \
- MAP(CB, 75) \
- MAP(CC, 76) \
- MAP(CD, 77) \
- MAP(CE, 78) \
- MAP(CF, 79) \
- MAP(D0, 80) \
- MAP(D1, 81) \
- MAP(D2, 82) \
- MAP(D3, 83) \
- MAP(D4, 84) \
- MAP(D5, 85) \
- MAP(D6, 86) \
- MAP(D7, 87) \
- MAP(D8, 88) \
- MAP(D9, 89) \
- MAP(DA, 90) \
- MAP(DB, 91) \
- MAP(DC, 92) \
- MAP(DD, 93) \
- MAP(DE, 94) \
- MAP(DF, 95) \
- MAP(E0, 96) \
- MAP(E1, 97) \
- MAP(E2, 98) \
- MAP(E3, 99) \
- MAP(E4, 100) \
- MAP(E5, 101) \
- MAP(E6, 102) \
- MAP(E7, 103) \
- MAP(E8, 104) \
- MAP(E9, 105) \
- MAP(EA, 106) \
- MAP(EB, 107) \
- MAP(EC, 108) \
- MAP(ED, 109) \
- MAP(EE, 110) \
- MAP(EF, 111) \
- MAP(F0, 112) \
- MAP(F1, 113) \
- MAP(F2, 114) \
- MAP(F3, 115) \
- MAP(F4, 116) \
- MAP(F5, 117) \
- MAP(F6, 118) \
- MAP(F7, 119) \
- MAP(F8, 120) \
- MAP(F9, 121) \
- MAP(FA, 122) \
- MAP(FB, 123) \
- MAP(FC, 124) \
- MAP(FD, 125) \
- MAP(FE, 126) \
- MAP(FF, 127)
-
-// A clone of X86 since we can't depend on something that is generated.
-namespace X86Local {
- enum {
- Pseudo = 0,
- RawFrm = 1,
- AddRegFrm = 2,
- RawFrmMemOffs = 3,
- RawFrmSrc = 4,
- RawFrmDst = 5,
- RawFrmDstSrc = 6,
- RawFrmImm8 = 7,
- RawFrmImm16 = 8,
- MRMDestMem = 32,
- MRMSrcMem = 33,
- MRMSrcMem4VOp3 = 34,
- MRMSrcMemOp4 = 35,
- MRMXm = 39,
- MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43,
- MRM4m = 44, MRM5m = 45, MRM6m = 46, MRM7m = 47,
- MRMDestReg = 48,
- MRMSrcReg = 49,
- MRMSrcReg4VOp3 = 50,
- MRMSrcRegOp4 = 51,
- MRMXr = 55,
- MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59,
- MRM4r = 60, MRM5r = 61, MRM6r = 62, MRM7r = 63,
-#define MAP(from, to) MRM_##from = to,
- MRM_MAPPING
-#undef MAP
- };
-
- enum {
- OB = 0, TB = 1, T8 = 2, TA = 3, XOP8 = 4, XOP9 = 5, XOPA = 6
- };
-
- enum {
- PS = 1, PD = 2, XS = 3, XD = 4
- };
-
- enum {
- VEX = 1, XOP = 2, EVEX = 3
- };
-
- enum {
- OpSize16 = 1, OpSize32 = 2
- };
-
- enum {
- AdSize16 = 1, AdSize32 = 2, AdSize64 = 3
- };
-
- enum {
- VEX_W0 = 0, VEX_W1 = 1, VEX_WIG = 2
- };
-}
-
using namespace X86Disassembler;
/// byteFromBitsInit - Extracts a value at most 8 bits in width from a BitsInit.
@@ -890,7 +767,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
case X86Local::MRM6m: case X86Local::MRM7m:
filter = new ExtendedFilter(false, Form - X86Local::MRM0m);
break;
- MRM_MAPPING
+ X86_INSTR_MRM_MAPPING
filter = new ExactFilter(0xC0 + Form - X86Local::MRM_C0); \
break;
} // switch (Form)
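
As context for the X86_INSTR_MRM_MAPPING expansion above: the list is an
X-macro, and each consumer defines MAP before expanding it. The case labels
here come from a definition along these lines (a sketch; the exact placement
is elided by the diff):

  #define MAP(from, to) case X86Local::MRM_##from:
    X86_INSTR_MRM_MAPPING  // expands to: case MRM_C0: case MRM_C1: ...
  #undef MAP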
diff --git a/utils/TableGen/X86RecognizableInstr.h b/utils/TableGen/X86RecognizableInstr.h
index 91ed928540c3..7fe731ec8b1c 100644
--- a/utils/TableGen/X86RecognizableInstr.h
+++ b/utils/TableGen/X86RecognizableInstr.h
@@ -24,6 +24,128 @@
namespace llvm {
+#define X86_INSTR_MRM_MAPPING \
+ MAP(C0, 64) \
+ MAP(C1, 65) \
+ MAP(C2, 66) \
+ MAP(C3, 67) \
+ MAP(C4, 68) \
+ MAP(C5, 69) \
+ MAP(C6, 70) \
+ MAP(C7, 71) \
+ MAP(C8, 72) \
+ MAP(C9, 73) \
+ MAP(CA, 74) \
+ MAP(CB, 75) \
+ MAP(CC, 76) \
+ MAP(CD, 77) \
+ MAP(CE, 78) \
+ MAP(CF, 79) \
+ MAP(D0, 80) \
+ MAP(D1, 81) \
+ MAP(D2, 82) \
+ MAP(D3, 83) \
+ MAP(D4, 84) \
+ MAP(D5, 85) \
+ MAP(D6, 86) \
+ MAP(D7, 87) \
+ MAP(D8, 88) \
+ MAP(D9, 89) \
+ MAP(DA, 90) \
+ MAP(DB, 91) \
+ MAP(DC, 92) \
+ MAP(DD, 93) \
+ MAP(DE, 94) \
+ MAP(DF, 95) \
+ MAP(E0, 96) \
+ MAP(E1, 97) \
+ MAP(E2, 98) \
+ MAP(E3, 99) \
+ MAP(E4, 100) \
+ MAP(E5, 101) \
+ MAP(E6, 102) \
+ MAP(E7, 103) \
+ MAP(E8, 104) \
+ MAP(E9, 105) \
+ MAP(EA, 106) \
+ MAP(EB, 107) \
+ MAP(EC, 108) \
+ MAP(ED, 109) \
+ MAP(EE, 110) \
+ MAP(EF, 111) \
+ MAP(F0, 112) \
+ MAP(F1, 113) \
+ MAP(F2, 114) \
+ MAP(F3, 115) \
+ MAP(F4, 116) \
+ MAP(F5, 117) \
+ MAP(F6, 118) \
+ MAP(F7, 119) \
+ MAP(F8, 120) \
+ MAP(F9, 121) \
+ MAP(FA, 122) \
+ MAP(FB, 123) \
+ MAP(FC, 124) \
+ MAP(FD, 125) \
+ MAP(FE, 126) \
+ MAP(FF, 127)
+
+// A clone of X86 since we can't depend on something that is generated.
+namespace X86Local {
+ enum {
+ Pseudo = 0,
+ RawFrm = 1,
+ AddRegFrm = 2,
+ RawFrmMemOffs = 3,
+ RawFrmSrc = 4,
+ RawFrmDst = 5,
+ RawFrmDstSrc = 6,
+ RawFrmImm8 = 7,
+ RawFrmImm16 = 8,
+ MRMDestMem = 32,
+ MRMSrcMem = 33,
+ MRMSrcMem4VOp3 = 34,
+ MRMSrcMemOp4 = 35,
+ MRMXm = 39,
+ MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43,
+ MRM4m = 44, MRM5m = 45, MRM6m = 46, MRM7m = 47,
+ MRMDestReg = 48,
+ MRMSrcReg = 49,
+ MRMSrcReg4VOp3 = 50,
+ MRMSrcRegOp4 = 51,
+ MRMXr = 55,
+ MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59,
+ MRM4r = 60, MRM5r = 61, MRM6r = 62, MRM7r = 63,
+#define MAP(from, to) MRM_##from = to,
+ X86_INSTR_MRM_MAPPING
+#undef MAP
+ };
+
+ enum {
+ OB = 0, TB = 1, T8 = 2, TA = 3, XOP8 = 4, XOP9 = 5, XOPA = 6
+ };
+
+ enum {
+ PS = 1, PD = 2, XS = 3, XD = 4
+ };
+
+ enum {
+ VEX = 1, XOP = 2, EVEX = 3
+ };
+
+ enum {
+ OpSize16 = 1, OpSize32 = 2
+ };
+
+ enum {
+ AdSize16 = 1, AdSize32 = 2, AdSize64 = 3
+ };
+
+ enum {
+ VEX_W0 = 0, VEX_W1 = 1, VEX_WIG = 2
+ };
+}
+
namespace X86Disassembler {
/// RecognizableInstr - Encapsulates all information required to decode a single
diff --git a/utils/git-svn/git-llvm b/utils/git-svn/git-llvm
index c2eaa5b6e640..55d3129c4a82 100755
--- a/utils/git-svn/git-llvm
+++ b/utils/git-svn/git-llvm
@@ -205,21 +205,26 @@ def fix_eol_style_native(rev, sr, svn_sr_path):
# Use ignore_errors because 'svn propget' prints errors if the file doesn't
# have the named property. There doesn't seem to be a way to suppress that.
eol_props = svn(svn_sr_path, 'propget', 'svn:eol-style', *files,
- ignore_errors=True).split('\n')
+ ignore_errors=True)
crlf_files = []
- for eol_prop in eol_props:
- # Remove spare CR.
- eol_prop = eol_prop.strip('\r')
- if not eol_prop:
- continue
- prop_parts = eol_prop.rsplit(' - ', 1)
- if len(prop_parts) != 2:
- eprint("unable to parse svn propget line:")
- eprint(eol_prop)
- continue
- (f, eol_style) = prop_parts
- if eol_style == 'native':
- crlf_files.append(f)
+ if len(files) == 1:
+ # No need to split propget output on ' - ' when we have one file.
+ if eol_props.strip() == 'native':
+ crlf_files = files
+ else:
+ for eol_prop in eol_props.split('\n'):
+ # Remove spare CR.
+ eol_prop = eol_prop.strip('\r')
+ if not eol_prop:
+ continue
+ prop_parts = eol_prop.rsplit(' - ', 1)
+ if len(prop_parts) != 2:
+ eprint("unable to parse svn propget line:")
+ eprint(eol_prop)
+ continue
+ (f, eol_style) = prop_parts
+ if eol_style == 'native':
+ crlf_files.append(f)
# Reformat all files with native SVN line endings to Unix format. SVN knows
# files with native line endings are text files. It will commit just the
# diff, and not a mass line ending change.
diff --git a/utils/release/build_llvm_package.bat b/utils/release/build_llvm_package.bat
index eca74347cf3e..79871781211a 100755
--- a/utils/release/build_llvm_package.bat
+++ b/utils/release/build_llvm_package.bat
@@ -10,7 +10,8 @@ REM Prerequisites:
REM
REM Visual Studio 2017, CMake, Ninja, SVN, GNUWin32, SWIG, Python 3,
REM NSIS with the strlen_8192 patch,
-REM Visual Studio 2017 SDK (for the clang-format plugin).
+REM Visual Studio 2017 SDK and Nuget (for the clang-format plugin),
+REM Perl (for the OpenMP run-time).
REM
REM
REM For LLDB, SWIG version <= 3.0.8 needs to be used to work around
@@ -20,9 +21,8 @@ REM https://github.com/swig/swig/issues/769
REM You need to modify the paths below:
set vsdevcmd=C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\Common7\Tools\VsDevCmd.bat
-set python32_dir=C:\Users\hwennborg\AppData\Local\Programs\Python\Python35-32
-set python64_dir=C:\Users\hwennborg\AppData\Local\Programs\Python\Python35
-set PATH=%PATH%;c:\gnuwin32\bin
+set python32_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python35-32
+set python64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python35
set revision=%1
set branch=trunk
diff --git a/utils/vscode/README b/utils/vscode/README
new file mode 100644
index 000000000000..6febb5e3c100
--- /dev/null
+++ b/utils/vscode/README
@@ -0,0 +1,18 @@
+This directory contains a "bundle" that provides syntax highlighting of
+TableGen files for the Microsoft VSCode editor. The highlighting follows that
+of the TextMate "C" bundle, as this is a translation of the TextMate bundle to
+VSCode using the "yo code" npm package. Currently, keywords, comments, and
+strings are highlighted.
+
+This colorizer was generated by the VSCode generator tool "Yo Code"
+(https://github.com/Microsoft/vscode-generator-code) from the existing
+TableGen.tmLanguage TextMate syntax colorizer in utils/textmate. This README
+was copied from utils/textmate/README.
+
+To install this VSCode .td file colorizer, copy it to the following location
+for your operating system:
+
+ - Windows: %USERPROFILE%\.vscode\extensions
+ - Mac: ~/.vscode/extensions
+ - Linux: ~/.vscode/extensions
+
diff --git a/utils/vscode/tablegen/.vscode/launch.json b/utils/vscode/tablegen/.vscode/launch.json
new file mode 100644
index 000000000000..8384213de75f
--- /dev/null
+++ b/utils/vscode/tablegen/.vscode/launch.json
@@ -0,0 +1,13 @@
+// A launch configuration that launches the extension inside a new window
+{
+ "version": "0.1.0",
+ "configurations": [
+ {
+ "name": "Launch Extension",
+ "type": "extensionHost",
+ "request": "launch",
+ "runtimeExecutable": "${execPath}",
+ "args": ["--extensionDevelopmentPath=${workspaceRoot}" ]
+ }
+ ]
+} \ No newline at end of file
diff --git a/utils/vscode/tablegen/CHANGELOG.md b/utils/vscode/tablegen/CHANGELOG.md
new file mode 100644
index 000000000000..4cedbb953a97
--- /dev/null
+++ b/utils/vscode/tablegen/CHANGELOG.md
@@ -0,0 +1,4 @@
+# Change Log
+
+- Initial release
+
diff --git a/utils/vscode/tablegen/README.md b/utils/vscode/tablegen/README.md
new file mode 100644
index 000000000000..e726004edf78
--- /dev/null
+++ b/utils/vscode/tablegen/README.md
@@ -0,0 +1,13 @@
+# tablegen README
+
+This VSCode colorizer extension is a translation of the TextMate bundle to
+VSCode using the "yo code" npm package. Currently, keywords, comments, and
+strings are highlighted.
+
+To install this VSCode .td file colorizer, copy it to the following location
+for your operating system:
+
+ - Windows: %USERPROFILE%\.vscode\extensions
+ - Mac: ~/.vscode/extensions
+ - Linux: ~/.vscode/extensions
+
diff --git a/utils/vscode/tablegen/language-configuration.json b/utils/vscode/tablegen/language-configuration.json
new file mode 100644
index 000000000000..aa2571000769
--- /dev/null
+++ b/utils/vscode/tablegen/language-configuration.json
@@ -0,0 +1,30 @@
+{
+ "comments": {
+    // symbol used for single-line comments; remove this entry if your language does not support line comments
+ "lineComment": "//",
+    // symbols used to start and end a block comment; remove this entry if your language does not support block comments
+ "blockComment": [ "/*", "*/" ]
+ },
+ // symbols used as brackets
+ "brackets": [
+ ["{", "}"],
+ ["[", "]"],
+ ["(", ")"]
+ ],
+ // symbols that are auto closed when typing
+ "autoClosingPairs": [
+ ["{", "}"],
+ ["[", "]"],
+ ["(", ")"],
+ ["\"", "\""],
+ ["'", "'"]
+ ],
+  // symbols that can be used to surround a selection
+ "surroundingPairs": [
+ ["{", "}"],
+ ["[", "]"],
+ ["(", ")"],
+ ["\"", "\""],
+ ["'", "'"]
+ ]
+} \ No newline at end of file
diff --git a/utils/vscode/tablegen/package.json b/utils/vscode/tablegen/package.json
new file mode 100644
index 000000000000..efd32accf139
--- /dev/null
+++ b/utils/vscode/tablegen/package.json
@@ -0,0 +1,26 @@
+{
+ "name": "tablegen",
+ "displayName": "TableGen",
+ "description": "VSCode Language Colorizer for LLVM's TableGen language.",
+ "version": "0.0.1",
+ "publisher": "llvm",
+ "engines": {
+ "vscode": "^1.12.0"
+ },
+ "categories": [
+ "Languages"
+ ],
+ "contributes": {
+ "languages": [{
+ "id": "tablegen",
+ "aliases": ["TableGen", "tablegen"],
+ "extensions": [".td"],
+ "configuration": "./language-configuration.json"
+ }],
+ "grammars": [{
+ "language": "tablegen",
+ "scopeName": "source.tablegen",
+ "path": "./syntaxes/TableGen.tmLanguage"
+ }]
+ }
+} \ No newline at end of file
diff --git a/utils/vscode/tablegen/syntaxes/TableGen.tmLanguage b/utils/vscode/tablegen/syntaxes/TableGen.tmLanguage
new file mode 100644
index 000000000000..f3cf2d618fd5
--- /dev/null
+++ b/utils/vscode/tablegen/syntaxes/TableGen.tmLanguage
@@ -0,0 +1,132 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+ <key>fileTypes</key>
+ <array><string>td</string></array>
+ <key>foldingStartMarker</key>
+ <string>/\*\*|\{\s*$</string>
+ <key>foldingStopMarker</key>
+ <string>\*\*/|^\s*\}</string>
+ <key>name</key>
+ <string>TableGen</string>
+ <key>patterns</key>
+ <array>
+ <dict>
+ <key>include</key>
+ <string>#comments</string>
+ </dict>
+ <dict>
+ <key>match</key>
+ <string>\b(def|let|in|code|dag|string|list|bits|bit|field|include|defm|foreach|class|multiclass|int)\b</string>
+ <key>name</key>
+ <string>keyword.control.tablegen</string>
+ </dict>
+ <dict>
+ <key>begin</key>
+ <string>"</string>
+ <key>end</key>
+ <string>"</string>
+ <key>name</key>
+ <string>string.quoted.double.untitled</string>
+ <key>patterns</key>
+ <array>
+ <dict>
+ <key>match</key>
+ <string>\\.</string>
+ <key>name</key>
+ <string>constant.character.escape.tablegen</string>
+ </dict>
+ </array>
+ </dict>
+ </array>
+ <key>repository</key>
+ <dict>
+ <key>comments</key>
+ <dict>
+ <key>patterns</key>
+ <array>
+ <dict>
+ <key>captures</key>
+ <dict>
+ <key>1</key>
+ <dict>
+ <key>name</key>
+ <string>meta.toc-list.banner.block.tablegen</string>
+ </dict>
+ </dict>
+ <key>match</key>
+ <string>^/\* =(\s*.*?)\s*= \*/$\n?</string>
+ <key>name</key>
+ <string>comment.block.tablegen</string>
+ </dict>
+ <dict>
+ <key>begin</key>
+ <string>/\*</string>
+ <key>captures</key>
+ <dict>
+ <key>0</key>
+ <dict>
+ <key>name</key>
+ <string>punctuation.definition.comment.tablegen</string>
+ </dict>
+ </dict>
+ <key>end</key>
+ <string>\*/</string>
+ <key>name</key>
+ <string>comment.block.tablegen</string>
+ </dict>
+ <dict>
+ <key>match</key>
+ <string>\*/.*\n</string>
+ <key>name</key>
+ <string>invalid.illegal.stray-comment-end.tablegen</string>
+ </dict>
+ <dict>
+ <key>captures</key>
+ <dict>
+ <key>1</key>
+ <dict>
+ <key>name</key>
+ <string>meta.toc-list.banner.line.tablegen</string>
+ </dict>
+ </dict>
+ <key>match</key>
+ <string>^// =(\s*.*?)\s*=\s*$\n?</string>
+ <key>name</key>
+ <string>comment.line.banner.tablegen</string>
+ </dict>
+ <dict>
+ <key>begin</key>
+ <string>//</string>
+ <key>beginCaptures</key>
+ <dict>
+ <key>0</key>
+ <dict>
+ <key>name</key>
+ <string>punctuation.definition.comment.tablegen</string>
+ </dict>
+ </dict>
+ <key>end</key>
+ <string>$\n?</string>
+ <key>name</key>
+ <string>comment.line.double-slash.tablegen</string>
+ <key>patterns</key>
+ <array>
+ <dict>
+ <key>match</key>
+ <string>(?&gt;\\\s*\n)</string>
+ <key>name</key>
+ <string>punctuation.separator.continuation.tablegen</string>
+ </dict>
+ </array>
+ </dict>
+ </array>
+ </dict>
+ </dict>
+ <key>scopeName</key>
+ <string>source.tablegen</string>
+ <key>uuid</key>
+ <string>3A090BFC-E74B-4993-8DAE-7CCF6D238A32</string>
+</dict>
+</plist>
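
The keyword rule in the grammar above is a single word-boundary alternation.
A stand-alone C++ check of that exact pattern; std::regex is used purely for
illustration here, since VSCode applies the grammar through its own TextMate
engine rather than anything shown below:

    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
      // The keyword.control.tablegen pattern from the grammar, verbatim.
      std::regex Keyword(
          R"(\b(def|let|in|code|dag|string|list|bits|bit|field|include)"
          R"(|defm|foreach|class|multiclass|int)\b)");
      std::string Line = "class Instruction { bits<32> Inst; }";
      for (std::sregex_iterator I(Line.begin(), Line.end(), Keyword), E;
           I != E; ++I)
        std::cout << I->str() << "\n"; // prints "class", then "bits"
    }
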
diff --git a/utils/vscode/tablegen/vsc-extension-quickstart.md b/utils/vscode/tablegen/vsc-extension-quickstart.md
new file mode 100644
index 000000000000..abfbfdb70214
--- /dev/null
+++ b/utils/vscode/tablegen/vsc-extension-quickstart.md
@@ -0,0 +1,27 @@
+# Welcome to your VS Code Extension
+
+## What's in the folder
+* This folder contains all of the files necessary for your extension
+* `package.json` - this is the manifest file in which you declare your language support and define
+the location of the grammar file that has been copied into your extension.
+* `syntaxes/TableGen.tmLanguage` - this is the TextMate grammar file that is used for tokenization
+* `language-configuration.json` - this is the language configuration, defining the tokens that are used for
+comments and brackets.
+
+## Get up and running straight away
+* Make sure the language configuration settings in `language-configuration.json` are accurate
+* Press `F5` to open a new window with your extension loaded
+* Create a new file with a file name suffix matching your language
+* Verify that syntax highlighting works and that the language configuration settings are working
+
+## Make changes
+* You can relaunch the extension from the debug toolbar after making changes to the files listed above
+* You can also reload (`Ctrl+R` or `Cmd+R` on Mac) the VS Code window with your extension to load your changes
+
+## Add more language features
+* To add features such as IntelliSense, hovers, and validators, check out the VS Code extenders documentation at
+https://code.visualstudio.com/docs
+
+## Install your extension
+* To start using your extension with Visual Studio Code, copy it into the `<user home>/.vscode/extensions` folder and restart Code.
+* To share your extension with the world, read about publishing an extension at https://code.visualstudio.com/docs. \ No newline at end of file